From cd54c58d8b66c0d716d59e47d9b3fa49e07f9435 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 15 Nov 2010 03:04:51 +0000 Subject: [PATCH] --- yaml --- r: 229065 b: refs/heads/master c: 009ca3897ea8313b4ed4da964a2f31ecf5a0624d h: refs/heads/master i: 229063: 89b128c1aec6331a5b9c8dff86f1e2e6c65b1b53 v: v3 --- [refs] | 2 +- trunk/MAINTAINERS | 5 +- trunk/arch/arm/mach-dove/common.c | 4 +- .../arch/arm/mach-tegra/include/mach/sdhci.h | 29 - trunk/arch/microblaze/Kconfig.debug | 4 + trunk/arch/microblaze/Makefile | 2 +- trunk/arch/microblaze/configs/mmu_defconfig | 1 + trunk/arch/microblaze/include/asm/pvr.h | 185 +- trunk/arch/microblaze/kernel/cpu/cpuinfo.c | 1 - trunk/arch/microblaze/kernel/entry.S | 46 +- trunk/arch/microblaze/kernel/exceptions.c | 3 - .../microblaze/kernel/hw_exception_handler.S | 9 - trunk/arch/microblaze/kernel/prom.c | 4 +- trunk/arch/microblaze/kernel/vmlinux.lds.S | 16 +- trunk/arch/microblaze/lib/memmove.c | 2 +- trunk/arch/microblaze/lib/muldi3.S | 121 ++ trunk/arch/microblaze/lib/muldi3.c | 60 - trunk/arch/x86/include/asm/acpi.h | 11 +- trunk/arch/x86/include/asm/amd_nb.h | 13 +- trunk/arch/x86/include/asm/fixmap.h | 4 +- trunk/arch/x86/include/asm/gpio.h | 5 +- trunk/arch/x86/include/asm/kdebug.h | 1 + trunk/arch/x86/include/asm/mach_traps.h | 12 +- trunk/arch/x86/include/asm/nmi.h | 20 - trunk/arch/x86/include/asm/numa_64.h | 2 +- trunk/arch/x86/include/asm/perf_event_p4.h | 3 - trunk/arch/x86/kernel/amd_nb.c | 7 - trunk/arch/x86/kernel/aperture_64.c | 44 +- trunk/arch/x86/kernel/apic/apic.c | 2 +- trunk/arch/x86/kernel/apic/hw_nmi.c | 3 +- trunk/arch/x86/kernel/apic/x2apic_uv_x.c | 4 +- trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c | 5 +- trunk/arch/x86/kernel/cpu/perf_event.c | 3 +- trunk/arch/x86/kernel/cpu/perf_event_p4.c | 28 +- trunk/arch/x86/kernel/dumpstack.c | 6 + trunk/arch/x86/kernel/entry_64.S | 36 +- trunk/arch/x86/kernel/kgdb.c | 7 +- trunk/arch/x86/kernel/reboot.c | 5 +- trunk/arch/x86/kernel/smpboot.c | 4 +- trunk/arch/x86/kernel/traps.c | 102 +- trunk/arch/x86/kernel/tsc.c | 2 +- trunk/arch/x86/mm/amdtopology_64.c | 86 +- trunk/arch/x86/mm/numa_64.c | 157 +- trunk/arch/x86/mm/srat_64.c | 26 +- trunk/arch/x86/oprofile/nmi_int.c | 3 +- trunk/arch/x86/oprofile/nmi_timer_int.c | 2 +- trunk/arch/x86/pci/amd_bus.c | 33 - trunk/drivers/char/ipmi/ipmi_watchdog.c | 2 +- trunk/drivers/mfd/sh_mobile_sdhi.c | 6 - trunk/drivers/mmc/card/Kconfig | 1 - trunk/drivers/mmc/core/Kconfig | 11 - trunk/drivers/mmc/core/bus.c | 8 +- trunk/drivers/mmc/core/core.c | 206 +- trunk/drivers/mmc/core/core.h | 9 +- trunk/drivers/mmc/core/debugfs.c | 5 - trunk/drivers/mmc/core/host.c | 206 +- trunk/drivers/mmc/core/host.h | 21 - trunk/drivers/mmc/core/mmc.c | 91 +- trunk/drivers/mmc/core/mmc_ops.c | 101 - trunk/drivers/mmc/core/mmc_ops.h | 1 - trunk/drivers/mmc/core/sd.c | 16 +- trunk/drivers/mmc/core/sdio.c | 36 +- trunk/drivers/mmc/core/sdio_bus.c | 32 + trunk/drivers/mmc/host/Kconfig | 37 - trunk/drivers/mmc/host/Makefile | 3 - trunk/drivers/mmc/host/davinci_mmc.c | 80 +- trunk/drivers/mmc/host/dw_mmc.c | 1796 ----------------- trunk/drivers/mmc/host/dw_mmc.h | 168 -- trunk/drivers/mmc/host/mxcmmc.c | 53 +- trunk/drivers/mmc/host/sdhci-dove.c | 70 - trunk/drivers/mmc/host/sdhci-pci.c | 161 +- trunk/drivers/mmc/host/sdhci-pltfm.c | 6 - trunk/drivers/mmc/host/sdhci-pltfm.h | 2 - trunk/drivers/mmc/host/sdhci-s3c.c | 66 - trunk/drivers/mmc/host/sdhci-tegra.c | 257 --- trunk/drivers/mmc/host/sdhci.c | 45 +- trunk/drivers/mmc/host/sdhci.h | 3 +- 
trunk/drivers/mmc/host/tmio_mmc.c | 561 +---- trunk/drivers/mmc/host/tmio_mmc.h | 228 +++ trunk/drivers/rtc/class.c | 13 - trunk/drivers/rtc/interface.c | 574 +++--- trunk/drivers/rtc/rtc-dev.c | 104 + trunk/drivers/rtc/rtc-lib.c | 28 - trunk/drivers/watchdog/hpwdt.c | 2 +- trunk/fs/9p/acl.c | 2 +- trunk/fs/9p/xattr.c | 2 +- trunk/fs/ocfs2/Kconfig | 2 +- trunk/fs/ocfs2/alloc.c | 77 +- trunk/fs/ocfs2/alloc.h | 4 + trunk/fs/ocfs2/aops.c | 59 +- trunk/fs/ocfs2/cluster/heartbeat.c | 246 +-- trunk/fs/ocfs2/cluster/netdebug.c | 286 +-- trunk/fs/ocfs2/cluster/tcp.c | 145 +- trunk/fs/ocfs2/cluster/tcp_internal.h | 33 +- trunk/fs/ocfs2/dlm/dlmast.c | 76 +- trunk/fs/ocfs2/dlm/dlmcommon.h | 86 +- trunk/fs/ocfs2/dlm/dlmdebug.c | 200 +- trunk/fs/ocfs2/dlm/dlmdebug.h | 5 + trunk/fs/ocfs2/dlm/dlmdomain.c | 10 +- trunk/fs/ocfs2/dlm/dlmlock.c | 3 - trunk/fs/ocfs2/dlm/dlmthread.c | 132 +- trunk/fs/ocfs2/namei.c | 5 +- trunk/fs/ocfs2/ocfs2.h | 5 - trunk/fs/xfs/linux-2.6/sv.h | 59 + trunk/fs/xfs/linux-2.6/xfs_aops.c | 425 ++-- trunk/fs/xfs/linux-2.6/xfs_aops.h | 16 - trunk/fs/xfs/linux-2.6/xfs_buf.c | 235 +-- trunk/fs/xfs/linux-2.6/xfs_buf.h | 22 +- trunk/fs/xfs/linux-2.6/xfs_export.c | 12 +- trunk/fs/xfs/linux-2.6/xfs_linux.h | 1 + trunk/fs/xfs/linux-2.6/xfs_super.c | 22 +- trunk/fs/xfs/linux-2.6/xfs_sync.c | 92 +- trunk/fs/xfs/linux-2.6/xfs_trace.h | 59 +- trunk/fs/xfs/quota/xfs_dquot.c | 1 + trunk/fs/xfs/xfs_ag.h | 2 +- trunk/fs/xfs/xfs_alloc.c | 351 ++-- trunk/fs/xfs/xfs_attr_leaf.c | 4 +- trunk/fs/xfs/xfs_btree.c | 9 +- trunk/fs/xfs/xfs_buf_item.c | 32 +- trunk/fs/xfs/xfs_buf_item.h | 11 + trunk/fs/xfs/xfs_extfree_item.c | 97 +- trunk/fs/xfs/xfs_extfree_item.h | 11 +- trunk/fs/xfs/xfs_fsops.c | 1 - trunk/fs/xfs/xfs_iget.c | 79 +- trunk/fs/xfs/xfs_inode.c | 54 +- trunk/fs/xfs/xfs_inode.h | 15 +- trunk/fs/xfs/xfs_inode_item.c | 90 +- trunk/fs/xfs/xfs_iomap.c | 233 ++- trunk/fs/xfs/xfs_iomap.h | 27 +- trunk/fs/xfs/xfs_log.c | 739 ++++--- trunk/fs/xfs/xfs_log_cil.c | 17 +- trunk/fs/xfs/xfs_log_priv.h | 127 +- trunk/fs/xfs/xfs_log_recover.c | 620 +++--- trunk/fs/xfs/xfs_mount.c | 23 +- trunk/fs/xfs/xfs_mount.h | 14 - trunk/fs/xfs/xfs_trans.c | 79 +- trunk/fs/xfs/xfs_trans.h | 2 +- trunk/fs/xfs/xfs_trans_ail.c | 232 +-- trunk/fs/xfs/xfs_trans_extfree.c | 8 +- trunk/fs/xfs/xfs_trans_priv.h | 35 +- trunk/fs/xfs/xfs_vnodeops.c | 61 +- trunk/include/linux/dynamic_debug.h | 18 +- trunk/include/linux/mfd/tmio.h | 5 - trunk/include/linux/mmc/dw_mmc.h | 217 -- trunk/include/linux/mmc/host.h | 19 - trunk/include/linux/mmc/mmc.h | 2 - trunk/include/linux/mmc/sdhci.h | 6 - trunk/include/linux/pci_ids.h | 8 - trunk/include/linux/rtc.h | 51 +- trunk/include/linux/tracepoint.h | 4 +- trunk/include/trace/define_trace.h | 10 - trunk/include/trace/events/skb.h | 4 +- trunk/kernel/Makefile | 1 - trunk/kernel/exit.c | 14 +- trunk/kernel/perf_event.c | 82 +- trunk/kernel/trace/Makefile | 2 +- trunk/kernel/trace/trace.c | 6 +- trunk/lib/dynamic_debug.c | 9 +- trunk/tools/perf/Makefile | 2 +- trunk/tools/perf/builtin-record.c | 3 - trunk/tools/perf/builtin-sched.c | 5 +- trunk/tools/perf/builtin-stat.c | 5 +- trunk/tools/perf/builtin-test.c | 116 +- trunk/tools/perf/builtin-top.c | 2 - trunk/tools/perf/util/evsel.c | 87 +- trunk/tools/perf/util/evsel.h | 2 +- trunk/tools/perf/util/parse-events.c | 74 +- trunk/tools/perf/util/session.c | 2 +- 168 files changed, 3883 insertions(+), 8307 deletions(-) delete mode 100644 trunk/arch/arm/mach-tegra/include/mach/sdhci.h create mode 100644 trunk/arch/microblaze/lib/muldi3.S delete mode 
100644 trunk/arch/microblaze/lib/muldi3.c delete mode 100644 trunk/drivers/mmc/host/dw_mmc.c delete mode 100644 trunk/drivers/mmc/host/dw_mmc.h delete mode 100644 trunk/drivers/mmc/host/sdhci-dove.c delete mode 100644 trunk/drivers/mmc/host/sdhci-tegra.c create mode 100644 trunk/drivers/mmc/host/tmio_mmc.h create mode 100644 trunk/fs/xfs/linux-2.6/sv.h delete mode 100644 trunk/include/linux/mmc/dw_mmc.h diff --git a/[refs] b/[refs] index 73f8edb244a1..d03925fe369c 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 7bc4a4ce68f8c6d064ea949446852e996526f692 +refs/heads/master: 009ca3897ea8313b4ed4da964a2f31ecf5a0624d diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 42f991e5a85d..bb6c1ac85138 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -1785,8 +1785,7 @@ S: Maintained F: drivers/usb/atm/cxacru.c CONFIGFS -M: Joel Becker -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/configfs.git +M: Joel Becker S: Supported F: fs/configfs/ F: include/linux/configfs.h @@ -4550,7 +4549,7 @@ F: include/linux/oprofile.h ORACLE CLUSTER FILESYSTEM 2 (OCFS2) M: Mark Fasheh -M: Joel Becker +M: Joel Becker L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers) W: http://oss.oracle.com/projects/ocfs2/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2.git diff --git a/trunk/arch/arm/mach-dove/common.c b/trunk/arch/arm/mach-dove/common.c index fe627aba6da7..f7a12586a1f5 100644 --- a/trunk/arch/arm/mach-dove/common.c +++ b/trunk/arch/arm/mach-dove/common.c @@ -770,7 +770,7 @@ static struct resource dove_sdio0_resources[] = { }; static struct platform_device dove_sdio0 = { - .name = "sdhci-dove", + .name = "sdhci-mv", .id = 0, .dev = { .dma_mask = &sdio_dmamask, @@ -798,7 +798,7 @@ static struct resource dove_sdio1_resources[] = { }; static struct platform_device dove_sdio1 = { - .name = "sdhci-dove", + .name = "sdhci-mv", .id = 1, .dev = { .dma_mask = &sdio_dmamask, diff --git a/trunk/arch/arm/mach-tegra/include/mach/sdhci.h b/trunk/arch/arm/mach-tegra/include/mach/sdhci.h deleted file mode 100644 index 3ad086e859c3..000000000000 --- a/trunk/arch/arm/mach-tegra/include/mach/sdhci.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * include/asm-arm/arch-tegra/include/mach/sdhci.h - * - * Copyright (C) 2009 Palm, Inc. - * Author: Yvonne Yip - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ -#ifndef __ASM_ARM_ARCH_TEGRA_SDHCI_H -#define __ASM_ARM_ARCH_TEGRA_SDHCI_H - -#include - -struct tegra_sdhci_platform_data { - int cd_gpio; - int wp_gpio; - int power_gpio; - int is_8bit; -}; - -#endif diff --git a/trunk/arch/microblaze/Kconfig.debug b/trunk/arch/microblaze/Kconfig.debug index 012e377330cd..e66e25c4b0b2 100644 --- a/trunk/arch/microblaze/Kconfig.debug +++ b/trunk/arch/microblaze/Kconfig.debug @@ -23,4 +23,8 @@ config HEART_BEAT This option turns on/off heart beat kernel functionality. First GPIO node is taken. 
+config DEBUG_BOOTMEM + depends on DEBUG_KERNEL + bool "Debug BOOTMEM initialization" + endmenu diff --git a/trunk/arch/microblaze/Makefile b/trunk/arch/microblaze/Makefile index 6f432e6df9af..15f1f1d1840d 100644 --- a/trunk/arch/microblaze/Makefile +++ b/trunk/arch/microblaze/Makefile @@ -17,7 +17,7 @@ export CPU_VER CPU_MAJOR CPU_MINOR CPU_REV # The various CONFIG_XILINX cpu features options are integers 0/1/2... # rather than bools y/n -# Work out HW multipler support. This is tricky. +# Work out HW multipler support. This is icky. # 1. Spartan2 has no HW multiplers. # 2. MicroBlaze v3.x always uses them, except in Spartan 2 # 3. All other FPGa/CPU ver combos, we can trust the CONFIG_ settings diff --git a/trunk/arch/microblaze/configs/mmu_defconfig b/trunk/arch/microblaze/configs/mmu_defconfig index ab8fbe7ad90b..8b422b12ef78 100644 --- a/trunk/arch/microblaze/configs/mmu_defconfig +++ b/trunk/arch/microblaze/configs/mmu_defconfig @@ -66,4 +66,5 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_INFO=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set CONFIG_EARLY_PRINTK=y +CONFIG_DEBUG_BOOTMEM=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/trunk/arch/microblaze/include/asm/pvr.h b/trunk/arch/microblaze/include/asm/pvr.h index a10bec62e857..37db96a15b45 100644 --- a/trunk/arch/microblaze/include/asm/pvr.h +++ b/trunk/arch/microblaze/include/asm/pvr.h @@ -1,9 +1,9 @@ /* * Support for the MicroBlaze PVR (Processor Version Register) * - * Copyright (C) 2009 - 2011 Michal Simek + * Copyright (C) 2009 Michal Simek * Copyright (C) 2007 John Williams - * Copyright (C) 2007 - 2011 PetaLogix + * Copyright (C) 2007 - 2009 PetaLogix * * This file is subject to the terms and conditions of the GNU General * Public License. See the file COPYING in the main directory of this @@ -46,11 +46,11 @@ struct pvr_s { #define PVR2_I_LMB_MASK 0x10000000 #define PVR2_INTERRUPT_IS_EDGE_MASK 0x08000000 #define PVR2_EDGE_IS_POSITIVE_MASK 0x04000000 -#define PVR2_D_PLB_MASK 0x02000000 /* new */ -#define PVR2_I_PLB_MASK 0x01000000 /* new */ -#define PVR2_INTERCONNECT 0x00800000 /* new */ -#define PVR2_USE_EXTEND_FSL 0x00080000 /* new */ -#define PVR2_USE_FSL_EXC 0x00040000 /* new */ +#define PVR2_D_PLB_MASK 0x02000000 /* new */ +#define PVR2_I_PLB_MASK 0x01000000 /* new */ +#define PVR2_INTERCONNECT 0x00800000 /* new */ +#define PVR2_USE_EXTEND_FSL 0x00080000 /* new */ +#define PVR2_USE_FSL_EXC 0x00040000 /* new */ #define PVR2_USE_MSR_INSTR 0x00020000 #define PVR2_USE_PCMP_INSTR 0x00010000 #define PVR2_AREA_OPTIMISED 0x00008000 @@ -59,7 +59,7 @@ struct pvr_s { #define PVR2_USE_HW_MUL_MASK 0x00001000 #define PVR2_USE_FPU_MASK 0x00000800 #define PVR2_USE_MUL64_MASK 0x00000400 -#define PVR2_USE_FPU2_MASK 0x00000200 /* new */ +#define PVR2_USE_FPU2_MASK 0x00000200 /* new */ #define PVR2_USE_IPLBEXC 0x00000100 #define PVR2_USE_DPLBEXC 0x00000080 #define PVR2_OPCODE_0x0_ILL_MASK 0x00000040 @@ -122,103 +122,96 @@ struct pvr_s { /* PVR access macros */ -#define PVR_IS_FULL(_pvr) (_pvr.pvr[0] & PVR0_PVR_FULL_MASK) -#define PVR_USE_BARREL(_pvr) (_pvr.pvr[0] & PVR0_USE_BARREL_MASK) -#define PVR_USE_DIV(_pvr) (_pvr.pvr[0] & PVR0_USE_DIV_MASK) -#define PVR_USE_HW_MUL(_pvr) (_pvr.pvr[0] & PVR0_USE_HW_MUL_MASK) -#define PVR_USE_FPU(_pvr) (_pvr.pvr[0] & PVR0_USE_FPU_MASK) -#define PVR_USE_FPU2(_pvr) (_pvr.pvr[2] & PVR2_USE_FPU2_MASK) -#define PVR_USE_ICACHE(_pvr) (_pvr.pvr[0] & PVR0_USE_ICACHE_MASK) -#define PVR_USE_DCACHE(_pvr) (_pvr.pvr[0] & PVR0_USE_DCACHE_MASK) -#define PVR_VERSION(_pvr) ((_pvr.pvr[0] & PVR0_VERSION_MASK) >> 8) -#define 
PVR_USER1(_pvr) (_pvr.pvr[0] & PVR0_USER1_MASK) -#define PVR_USER2(_pvr) (_pvr.pvr[1] & PVR1_USER2_MASK) - -#define PVR_D_OPB(_pvr) (_pvr.pvr[2] & PVR2_D_OPB_MASK) -#define PVR_D_LMB(_pvr) (_pvr.pvr[2] & PVR2_D_LMB_MASK) -#define PVR_I_OPB(_pvr) (_pvr.pvr[2] & PVR2_I_OPB_MASK) -#define PVR_I_LMB(_pvr) (_pvr.pvr[2] & PVR2_I_LMB_MASK) -#define PVR_INTERRUPT_IS_EDGE(_pvr) \ - (_pvr.pvr[2] & PVR2_INTERRUPT_IS_EDGE_MASK) -#define PVR_EDGE_IS_POSITIVE(_pvr) \ - (_pvr.pvr[2] & PVR2_EDGE_IS_POSITIVE_MASK) -#define PVR_USE_MSR_INSTR(_pvr) (_pvr.pvr[2] & PVR2_USE_MSR_INSTR) -#define PVR_USE_PCMP_INSTR(_pvr) (_pvr.pvr[2] & PVR2_USE_PCMP_INSTR) -#define PVR_AREA_OPTIMISED(_pvr) (_pvr.pvr[2] & PVR2_AREA_OPTIMISED) -#define PVR_USE_MUL64(_pvr) (_pvr.pvr[2] & PVR2_USE_MUL64_MASK) -#define PVR_OPCODE_0x0_ILLEGAL(_pvr) \ - (_pvr.pvr[2] & PVR2_OPCODE_0x0_ILL_MASK) -#define PVR_UNALIGNED_EXCEPTION(_pvr) \ - (_pvr.pvr[2] & PVR2_UNALIGNED_EXC_MASK) -#define PVR_ILL_OPCODE_EXCEPTION(_pvr) \ - (_pvr.pvr[2] & PVR2_ILL_OPCODE_EXC_MASK) -#define PVR_IOPB_BUS_EXCEPTION(_pvr) \ - (_pvr.pvr[2] & PVR2_IOPB_BUS_EXC_MASK) -#define PVR_DOPB_BUS_EXCEPTION(_pvr) \ - (_pvr.pvr[2] & PVR2_DOPB_BUS_EXC_MASK) -#define PVR_DIV_ZERO_EXCEPTION(_pvr) \ - (_pvr.pvr[2] & PVR2_DIV_ZERO_EXC_MASK) -#define PVR_FPU_EXCEPTION(_pvr) (_pvr.pvr[2] & PVR2_FPU_EXC_MASK) -#define PVR_FSL_EXCEPTION(_pvr) (_pvr.pvr[2] & PVR2_USE_EXTEND_FSL) - -#define PVR_DEBUG_ENABLED(_pvr) (_pvr.pvr[3] & PVR3_DEBUG_ENABLED_MASK) -#define PVR_NUMBER_OF_PC_BRK(_pvr) \ - ((_pvr.pvr[3] & PVR3_NUMBER_OF_PC_BRK_MASK) >> 25) -#define PVR_NUMBER_OF_RD_ADDR_BRK(_pvr) \ - ((_pvr.pvr[3] & PVR3_NUMBER_OF_RD_ADDR_BRK_MASK) >> 19) -#define PVR_NUMBER_OF_WR_ADDR_BRK(_pvr) \ - ((_pvr.pvr[3] & PVR3_NUMBER_OF_WR_ADDR_BRK_MASK) >> 13) -#define PVR_FSL_LINKS(_pvr) ((_pvr.pvr[3] & PVR3_FSL_LINKS_MASK) >> 7) - -#define PVR_ICACHE_ADDR_TAG_BITS(_pvr) \ - ((_pvr.pvr[4] & PVR4_ICACHE_ADDR_TAG_BITS_MASK) >> 26) -#define PVR_ICACHE_USE_FSL(_pvr) \ - (_pvr.pvr[4] & PVR4_ICACHE_USE_FSL_MASK) -#define PVR_ICACHE_ALLOW_WR(_pvr) \ - (_pvr.pvr[4] & PVR4_ICACHE_ALLOW_WR_MASK) -#define PVR_ICACHE_LINE_LEN(_pvr) \ - (1 << ((_pvr.pvr[4] & PVR4_ICACHE_LINE_LEN_MASK) >> 21)) -#define PVR_ICACHE_BYTE_SIZE(_pvr) \ - (1 << ((_pvr.pvr[4] & PVR4_ICACHE_BYTE_SIZE_MASK) >> 16)) - -#define PVR_DCACHE_ADDR_TAG_BITS(_pvr) \ - ((_pvr.pvr[5] & PVR5_DCACHE_ADDR_TAG_BITS_MASK) >> 26) -#define PVR_DCACHE_USE_FSL(_pvr) (_pvr.pvr[5] & PVR5_DCACHE_USE_FSL_MASK) -#define PVR_DCACHE_ALLOW_WR(_pvr) \ - (_pvr.pvr[5] & PVR5_DCACHE_ALLOW_WR_MASK) +#define PVR_IS_FULL(pvr) (pvr.pvr[0] & PVR0_PVR_FULL_MASK) +#define PVR_USE_BARREL(pvr) (pvr.pvr[0] & PVR0_USE_BARREL_MASK) +#define PVR_USE_DIV(pvr) (pvr.pvr[0] & PVR0_USE_DIV_MASK) +#define PVR_USE_HW_MUL(pvr) (pvr.pvr[0] & PVR0_USE_HW_MUL_MASK) +#define PVR_USE_FPU(pvr) (pvr.pvr[0] & PVR0_USE_FPU_MASK) +#define PVR_USE_FPU2(pvr) (pvr.pvr[2] & PVR2_USE_FPU2_MASK) +#define PVR_USE_ICACHE(pvr) (pvr.pvr[0] & PVR0_USE_ICACHE_MASK) +#define PVR_USE_DCACHE(pvr) (pvr.pvr[0] & PVR0_USE_DCACHE_MASK) +#define PVR_VERSION(pvr) ((pvr.pvr[0] & PVR0_VERSION_MASK) >> 8) +#define PVR_USER1(pvr) (pvr.pvr[0] & PVR0_USER1_MASK) +#define PVR_USER2(pvr) (pvr.pvr[1] & PVR1_USER2_MASK) + +#define PVR_D_OPB(pvr) (pvr.pvr[2] & PVR2_D_OPB_MASK) +#define PVR_D_LMB(pvr) (pvr.pvr[2] & PVR2_D_LMB_MASK) +#define PVR_I_OPB(pvr) (pvr.pvr[2] & PVR2_I_OPB_MASK) +#define PVR_I_LMB(pvr) (pvr.pvr[2] & PVR2_I_LMB_MASK) +#define PVR_INTERRUPT_IS_EDGE(pvr) \ + (pvr.pvr[2] & PVR2_INTERRUPT_IS_EDGE_MASK) +#define 
PVR_EDGE_IS_POSITIVE(pvr) \ + (pvr.pvr[2] & PVR2_EDGE_IS_POSITIVE_MASK) +#define PVR_USE_MSR_INSTR(pvr) (pvr.pvr[2] & PVR2_USE_MSR_INSTR) +#define PVR_USE_PCMP_INSTR(pvr) (pvr.pvr[2] & PVR2_USE_PCMP_INSTR) +#define PVR_AREA_OPTIMISED(pvr) (pvr.pvr[2] & PVR2_AREA_OPTIMISED) +#define PVR_USE_MUL64(pvr) (pvr.pvr[2] & PVR2_USE_MUL64_MASK) +#define PVR_OPCODE_0x0_ILLEGAL(pvr) \ + (pvr.pvr[2] & PVR2_OPCODE_0x0_ILL_MASK) +#define PVR_UNALIGNED_EXCEPTION(pvr) \ + (pvr.pvr[2] & PVR2_UNALIGNED_EXC_MASK) +#define PVR_ILL_OPCODE_EXCEPTION(pvr) \ + (pvr.pvr[2] & PVR2_ILL_OPCODE_EXC_MASK) +#define PVR_IOPB_BUS_EXCEPTION(pvr) \ + (pvr.pvr[2] & PVR2_IOPB_BUS_EXC_MASK) +#define PVR_DOPB_BUS_EXCEPTION(pvr) \ + (pvr.pvr[2] & PVR2_DOPB_BUS_EXC_MASK) +#define PVR_DIV_ZERO_EXCEPTION(pvr) \ + (pvr.pvr[2] & PVR2_DIV_ZERO_EXC_MASK) +#define PVR_FPU_EXCEPTION(pvr) (pvr.pvr[2] & PVR2_FPU_EXC_MASK) +#define PVR_FSL_EXCEPTION(pvr) (pvr.pvr[2] & PVR2_USE_EXTEND_FSL) + +#define PVR_DEBUG_ENABLED(pvr) (pvr.pvr[3] & PVR3_DEBUG_ENABLED_MASK) +#define PVR_NUMBER_OF_PC_BRK(pvr) \ + ((pvr.pvr[3] & PVR3_NUMBER_OF_PC_BRK_MASK) >> 25) +#define PVR_NUMBER_OF_RD_ADDR_BRK(pvr) \ + ((pvr.pvr[3] & PVR3_NUMBER_OF_RD_ADDR_BRK_MASK) >> 19) +#define PVR_NUMBER_OF_WR_ADDR_BRK(pvr) \ + ((pvr.pvr[3] & PVR3_NUMBER_OF_WR_ADDR_BRK_MASK) >> 13) +#define PVR_FSL_LINKS(pvr) ((pvr.pvr[3] & PVR3_FSL_LINKS_MASK) >> 7) + +#define PVR_ICACHE_ADDR_TAG_BITS(pvr) \ + ((pvr.pvr[4] & PVR4_ICACHE_ADDR_TAG_BITS_MASK) >> 26) +#define PVR_ICACHE_USE_FSL(pvr) (pvr.pvr[4] & PVR4_ICACHE_USE_FSL_MASK) +#define PVR_ICACHE_ALLOW_WR(pvr) (pvr.pvr[4] & PVR4_ICACHE_ALLOW_WR_MASK) +#define PVR_ICACHE_LINE_LEN(pvr) \ + (1 << ((pvr.pvr[4] & PVR4_ICACHE_LINE_LEN_MASK) >> 21)) +#define PVR_ICACHE_BYTE_SIZE(pvr) \ + (1 << ((pvr.pvr[4] & PVR4_ICACHE_BYTE_SIZE_MASK) >> 16)) + +#define PVR_DCACHE_ADDR_TAG_BITS(pvr) \ + ((pvr.pvr[5] & PVR5_DCACHE_ADDR_TAG_BITS_MASK) >> 26) +#define PVR_DCACHE_USE_FSL(pvr) (pvr.pvr[5] & PVR5_DCACHE_USE_FSL_MASK) +#define PVR_DCACHE_ALLOW_WR(pvr) (pvr.pvr[5] & PVR5_DCACHE_ALLOW_WR_MASK) /* FIXME two shifts on one line needs any comment */ -#define PVR_DCACHE_LINE_LEN(_pvr) \ - (1 << ((_pvr.pvr[5] & PVR5_DCACHE_LINE_LEN_MASK) >> 21)) -#define PVR_DCACHE_BYTE_SIZE(_pvr) \ - (1 << ((_pvr.pvr[5] & PVR5_DCACHE_BYTE_SIZE_MASK) >> 16)) +#define PVR_DCACHE_LINE_LEN(pvr) \ + (1 << ((pvr.pvr[5] & PVR5_DCACHE_LINE_LEN_MASK) >> 21)) +#define PVR_DCACHE_BYTE_SIZE(pvr) \ + (1 << ((pvr.pvr[5] & PVR5_DCACHE_BYTE_SIZE_MASK) >> 16)) -#define PVR_DCACHE_USE_WRITEBACK(_pvr) \ - ((_pvr.pvr[5] & PVR5_DCACHE_USE_WRITEBACK) >> 14) +#define PVR_DCACHE_USE_WRITEBACK(pvr) \ + ((pvr.pvr[5] & PVR5_DCACHE_USE_WRITEBACK) >> 14) -#define PVR_ICACHE_BASEADDR(_pvr) \ - (_pvr.pvr[6] & PVR6_ICACHE_BASEADDR_MASK) -#define PVR_ICACHE_HIGHADDR(_pvr) \ - (_pvr.pvr[7] & PVR7_ICACHE_HIGHADDR_MASK) -#define PVR_DCACHE_BASEADDR(_pvr) \ - (_pvr.pvr[8] & PVR8_DCACHE_BASEADDR_MASK) -#define PVR_DCACHE_HIGHADDR(_pvr) \ - (_pvr.pvr[9] & PVR9_DCACHE_HIGHADDR_MASK) +#define PVR_ICACHE_BASEADDR(pvr) (pvr.pvr[6] & PVR6_ICACHE_BASEADDR_MASK) +#define PVR_ICACHE_HIGHADDR(pvr) (pvr.pvr[7] & PVR7_ICACHE_HIGHADDR_MASK) -#define PVR_TARGET_FAMILY(_pvr) \ - ((_pvr.pvr[10] & PVR10_TARGET_FAMILY_MASK) >> 24) +#define PVR_DCACHE_BASEADDR(pvr) (pvr.pvr[8] & PVR8_DCACHE_BASEADDR_MASK) +#define PVR_DCACHE_HIGHADDR(pvr) (pvr.pvr[9] & PVR9_DCACHE_HIGHADDR_MASK) -#define PVR_MSR_RESET_VALUE(_pvr) \ - (_pvr.pvr[11] & PVR11_MSR_RESET_VALUE_MASK) +#define PVR_TARGET_FAMILY(pvr) ((pvr.pvr[10] & 
PVR10_TARGET_FAMILY_MASK) >> 24) + +#define PVR_MSR_RESET_VALUE(pvr) \ + (pvr.pvr[11] & PVR11_MSR_RESET_VALUE_MASK) /* mmu */ -#define PVR_USE_MMU(_pvr) ((_pvr.pvr[11] & PVR11_USE_MMU) >> 30) -#define PVR_MMU_ITLB_SIZE(_pvr) (_pvr.pvr[11] & PVR11_MMU_ITLB_SIZE) -#define PVR_MMU_DTLB_SIZE(_pvr) (_pvr.pvr[11] & PVR11_MMU_DTLB_SIZE) -#define PVR_MMU_TLB_ACCESS(_pvr) (_pvr.pvr[11] & PVR11_MMU_TLB_ACCESS) -#define PVR_MMU_ZONES(_pvr) (_pvr.pvr[11] & PVR11_MMU_ZONES) +#define PVR_USE_MMU(pvr) ((pvr.pvr[11] & PVR11_USE_MMU) >> 30) +#define PVR_MMU_ITLB_SIZE(pvr) (pvr.pvr[11] & PVR11_MMU_ITLB_SIZE) +#define PVR_MMU_DTLB_SIZE(pvr) (pvr.pvr[11] & PVR11_MMU_DTLB_SIZE) +#define PVR_MMU_TLB_ACCESS(pvr) (pvr.pvr[11] & PVR11_MMU_TLB_ACCESS) +#define PVR_MMU_ZONES(pvr) (pvr.pvr[11] & PVR11_MMU_ZONES) /* endian */ -#define PVR_ENDIAN(_pvr) (_pvr.pvr[0] & PVR0_ENDI) +#define PVR_ENDIAN(pvr) (pvr.pvr[0] & PVR0_ENDI) int cpu_has_pvr(void); void get_pvr(struct pvr_s *pvr); diff --git a/trunk/arch/microblaze/kernel/cpu/cpuinfo.c b/trunk/arch/microblaze/kernel/cpu/cpuinfo.c index 2c309fccf230..87c79fa275c3 100644 --- a/trunk/arch/microblaze/kernel/cpu/cpuinfo.c +++ b/trunk/arch/microblaze/kernel/cpu/cpuinfo.c @@ -32,7 +32,6 @@ const struct cpu_ver_key cpu_ver_lookup[] = { {"7.30.a", 0x10}, {"7.30.b", 0x11}, {"8.00.a", 0x12}, - {"8.00.b", 0x13}, {NULL, 0}, }; diff --git a/trunk/arch/microblaze/kernel/entry.S b/trunk/arch/microblaze/kernel/entry.S index 41c30cdb2704..819238b8a429 100644 --- a/trunk/arch/microblaze/kernel/entry.S +++ b/trunk/arch/microblaze/kernel/entry.S @@ -287,44 +287,25 @@ * are masked. This is nice, means we don't have to CLI before state save */ C_ENTRY(_user_exception): - swi r1, r0, TOPHYS(PER_CPU(ENTRY_SP)) /* save stack */ addi r14, r14, 4 /* return address is 4 byte after call */ + swi r1, r0, TOPHYS(PER_CPU(ENTRY_SP)) /* save stack */ - mfs r1, rmsr - nop - andi r1, r1, MSR_UMS - bnei r1, 1f - -/* Kernel-mode state save - kernel execve */ - lwi r1, r0, TOPHYS(PER_CPU(ENTRY_SP)); /* Reload kernel stack-ptr*/ - tophys(r1,r1); - - addik r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack. */ - SAVE_REGS - - swi r1, r1, PTO + PT_MODE; /* pt_regs -> kernel mode */ - brid 2f; - nop; /* Fill delay slot */ - -/* User-mode state save. */ -1: lwi r1, r0, TOPHYS(PER_CPU(CURRENT_SAVE)); /* get saved current */ tophys(r1,r1); lwi r1, r1, TS_THREAD_INFO; /* get stack from task_struct */ -/* calculate kernel stack pointer from task struct 8k */ - addik r1, r1, THREAD_SIZE; - tophys(r1,r1); - - addik r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack. */ + /* MS these three instructions can be added to one */ + /* addik r1, r1, THREAD_SIZE; */ + /* tophys(r1,r1); */ + /* addik r1, r1, -STATE_SAVE_SIZE; */ + addik r1, r1, THREAD_SIZE + CONFIG_KERNEL_BASE_ADDR - CONFIG_KERNEL_START - STATE_SAVE_SIZE; SAVE_REGS swi r0, r1, PTO + PT_R3 swi r0, r1, PTO + PT_R4 - swi r0, r1, PTO + PT_MODE; /* Was in user-mode. */ lwi r11, r0, TOPHYS(PER_CPU(ENTRY_SP)); swi r11, r1, PTO+PT_R1; /* Store user SP. */ clear_ums; -2: lwi CURRENT_TASK, r0, TOPHYS(PER_CPU(CURRENT_SAVE)); + lwi CURRENT_TASK, r0, TOPHYS(PER_CPU(CURRENT_SAVE)); /* Save away the syscall number. */ swi r12, r1, PTO+PT_R0; tovirt(r1,r1) @@ -394,9 +375,6 @@ C_ENTRY(ret_from_trap): swi r3, r1, PTO + PT_R3 swi r4, r1, PTO + PT_R4 - lwi r11, r1, PTO + PT_MODE; -/* See if returning to kernel mode, if so, skip resched &c. */ - bnei r11, 2f; /* We're returning to user mode, so check for various conditions that * trigger rescheduling. 
*/ /* FIXME: Restructure all these flag checks. */ @@ -439,16 +417,6 @@ C_ENTRY(ret_from_trap): RESTORE_REGS; addik r1, r1, STATE_SAVE_SIZE /* Clean up stack space. */ lwi r1, r1, PT_R1 - PT_SIZE;/* Restore user stack pointer. */ - bri 6f; - -/* Return to kernel state. */ -2: set_bip; /* Ints masked for state restore */ - VM_OFF; - tophys(r1,r1); - RESTORE_REGS; - addik r1, r1, STATE_SAVE_SIZE /* Clean up stack space. */ - tovirt(r1,r1); -6: TRAP_return: /* Make global symbol for debugging */ rtbd r14, 0; /* Instructions to return from an IRQ */ nop; diff --git a/trunk/arch/microblaze/kernel/exceptions.c b/trunk/arch/microblaze/kernel/exceptions.c index a7fa6ae76d89..478f2943ede7 100644 --- a/trunk/arch/microblaze/kernel/exceptions.c +++ b/trunk/arch/microblaze/kernel/exceptions.c @@ -25,7 +25,6 @@ #include #include #include -#include #define MICROBLAZE_ILL_OPCODE_EXCEPTION 0x02 #define MICROBLAZE_IBUS_EXCEPTION 0x03 @@ -53,8 +52,6 @@ void die(const char *str, struct pt_regs *fp, long err) void sw_exception(struct pt_regs *regs) { _exception(SIGTRAP, regs, TRAP_BRKPT, regs->r16); - flush_dcache_range(regs->r16, regs->r16 + 0x4); - flush_icache_range(regs->r16, regs->r16 + 0x4); } void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) diff --git a/trunk/arch/microblaze/kernel/hw_exception_handler.S b/trunk/arch/microblaze/kernel/hw_exception_handler.S index 25f6e07d8de8..781195438ee6 100644 --- a/trunk/arch/microblaze/kernel/hw_exception_handler.S +++ b/trunk/arch/microblaze/kernel/hw_exception_handler.S @@ -945,20 +945,11 @@ store3: sbi r3, r4, 2; store4: sbi r3, r4, 3; /* Delay slot */ ex_shw_vm: /* Store the lower half-word, byte-by-byte into destination address */ -#ifdef __MICROBLAZEEL__ - lbui r3, r5, 0; -store5: sbi r3, r4, 0; - lbui r3, r5, 1; - brid ret_from_exc; -store6: sbi r3, r4, 1; /* Delay slot */ -#else lbui r3, r5, 2; store5: sbi r3, r4, 0; lbui r3, r5, 3; brid ret_from_exc; store6: sbi r3, r4, 1; /* Delay slot */ -#endif - ex_sw_end_vm: /* Exception handling of store word, ends. */ /* We have to prevent cases that get/put_user macros get unaligned pointer diff --git a/trunk/arch/microblaze/kernel/prom.c b/trunk/arch/microblaze/kernel/prom.c index c881393f07fd..a105301e2b7f 100644 --- a/trunk/arch/microblaze/kernel/prom.c +++ b/trunk/arch/microblaze/kernel/prom.c @@ -61,12 +61,14 @@ static int __init early_init_dt_scan_serial(unsigned long node, char *p; int *addr; - pr_debug("search \"serial\", depth: %d, uname: %s\n", depth, uname); + pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); /* find all serial nodes */ if (strncmp(uname, "serial", 6) != 0) return 0; + early_init_dt_check_for_initrd(node); + /* find compatible node with uartlite */ p = of_get_flat_dt_prop(node, "compatible", &l); if ((strncmp(p, "xlnx,xps-uartlite", 17) != 0) && diff --git a/trunk/arch/microblaze/kernel/vmlinux.lds.S b/trunk/arch/microblaze/kernel/vmlinux.lds.S index 3451bdec9f05..96a88c31fe48 100644 --- a/trunk/arch/microblaze/kernel/vmlinux.lds.S +++ b/trunk/arch/microblaze/kernel/vmlinux.lds.S @@ -123,10 +123,20 @@ SECTIONS { __init_end_before_initramfs = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - INIT_RAM_FS + .init.ramfs ALIGN(PAGE_SIZE) : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + . = ALIGN(4); + LONG(0); +/* + * FIXME this can break initramfs for MMU. + * Pad init.ramfs up to page boundary, + * so that __init_end == __bss_start. 
This will make image.elf + * consistent with the image.bin + */ + /* . = ALIGN(PAGE_SIZE); */ } - __init_end = .; .bss ALIGN (PAGE_SIZE) : AT(ADDR(.bss) - LOAD_OFFSET) { diff --git a/trunk/arch/microblaze/lib/memmove.c b/trunk/arch/microblaze/lib/memmove.c index 810fd68775e3..123e3616f2dd 100644 --- a/trunk/arch/microblaze/lib/memmove.c +++ b/trunk/arch/microblaze/lib/memmove.c @@ -182,7 +182,7 @@ void *memmove(void *v_dst, const void *v_src, __kernel_size_t c) for (; c >= 4; c -= 4) { value = *--i_src; *--i_dst = buf_hold | ((value & 0xFF000000)>> 24); - buf_hold = (value & 0xFFFFFF) << 8; + buf_hold = (value & 0xFFFFFF) << 8;; } #endif /* Realign the source */ diff --git a/trunk/arch/microblaze/lib/muldi3.S b/trunk/arch/microblaze/lib/muldi3.S new file mode 100644 index 000000000000..ceeaa8c407f2 --- /dev/null +++ b/trunk/arch/microblaze/lib/muldi3.S @@ -0,0 +1,121 @@ +#include + +/* + * Multiply operation for 64 bit integers, for devices with hard multiply + * Input : Operand1[H] in Reg r5 + * Operand1[L] in Reg r6 + * Operand2[H] in Reg r7 + * Operand2[L] in Reg r8 + * Output: Result[H] in Reg r3 + * Result[L] in Reg r4 + * + * Explaination: + * + * Both the input numbers are divided into 16 bit number as follows + * op1 = A B C D + * op2 = E F G H + * result = D * H + * + (C * H + D * G) << 16 + * + (B * H + C * G + D * F) << 32 + * + (A * H + B * G + C * F + D * E) << 48 + * + * Only 64 bits of the output are considered + */ + + .text + .globl __muldi3 + .type __muldi3, @function + .ent __muldi3 + +__muldi3: + addi r1, r1, -40 + +/* Save the input operands on the caller's stack */ + swi r5, r1, 44 + swi r6, r1, 48 + swi r7, r1, 52 + swi r8, r1, 56 + +/* Store all the callee saved registers */ + sw r20, r1, r0 + swi r21, r1, 4 + swi r22, r1, 8 + swi r23, r1, 12 + swi r24, r1, 16 + swi r25, r1, 20 + swi r26, r1, 24 + swi r27, r1, 28 + +/* Load all the 16 bit values for A thru H */ + lhui r20, r1, 44 /* A */ + lhui r21, r1, 46 /* B */ + lhui r22, r1, 48 /* C */ + lhui r23, r1, 50 /* D */ + lhui r24, r1, 52 /* E */ + lhui r25, r1, 54 /* F */ + lhui r26, r1, 56 /* G */ + lhui r27, r1, 58 /* H */ + +/* D * H ==> LSB of the result on stack ==> Store1 */ + mul r9, r23, r27 + swi r9, r1, 36 /* Pos2 and Pos3 */ + +/* Hi (Store1) + C * H + D * G ==> Store2 ==> Pos1 and Pos2 */ +/* Store the carry generated in position 2 for Pos 3 */ + lhui r11, r1, 36 /* Pos2 */ + mul r9, r22, r27 /* C * H */ + mul r10, r23, r26 /* D * G */ + add r9, r9, r10 + addc r12, r0, r0 + add r9, r9, r11 + addc r12, r12, r0 /* Store the Carry */ + shi r9, r1, 36 /* Store Pos2 */ + swi r9, r1, 32 + lhui r11, r1, 32 + shi r11, r1, 34 /* Store Pos1 */ + +/* Hi (Store2) + B * H + C * G + D * F ==> Store3 ==> Pos0 and Pos1 */ + mul r9, r21, r27 /* B * H */ + mul r10, r22, r26 /* C * G */ + mul r7, r23, r25 /* D * F */ + add r9, r9, r11 + add r9, r9, r10 + add r9, r9, r7 + swi r9, r1, 32 /* Pos0 and Pos1 */ + +/* Hi (Store3) + A * H + B * G + C * F + D * E ==> Store3 ==> Pos0 */ + lhui r11, r1, 32 /* Pos0 */ + mul r9, r20, r27 /* A * H */ + mul r10, r21, r26 /* B * G */ + mul r7, r22, r25 /* C * F */ + mul r8, r23, r24 /* D * E */ + add r9, r9, r11 + add r9, r9, r10 + add r9, r9, r7 + add r9, r9, r8 + sext16 r9, r9 /* Sign extend the MSB */ + shi r9, r1, 32 + +/* Move results to r3 and r4 */ + lhui r3, r1, 32 + add r3, r3, r12 + shi r3, r1, 32 + lwi r3, r1, 32 /* Hi Part */ + lwi r4, r1, 36 /* Lo Part */ + +/* Restore Callee saved registers */ + lw r20, r1, r0 + lwi r21, r1, 4 + lwi r22, r1, 8 + lwi r23, r1, 12 + lwi r24, r1, 
16 + lwi r25, r1, 20 + lwi r26, r1, 24 + lwi r27, r1, 28 + +/* Restore Frame and return */ + rtsd r15, 8 + addi r1, r1, 40 + +.size __muldi3, . - __muldi3 +.end __muldi3 diff --git a/trunk/arch/microblaze/lib/muldi3.c b/trunk/arch/microblaze/lib/muldi3.c deleted file mode 100644 index d4860e154d29..000000000000 --- a/trunk/arch/microblaze/lib/muldi3.c +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#include "libgcc.h" - -#define DWtype long long -#define UWtype unsigned long -#define UHWtype unsigned short - -#define W_TYPE_SIZE 32 - -#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) -#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) -#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) - -/* If we still don't have umul_ppmm, define it using plain C. */ -#if !defined(umul_ppmm) -#define umul_ppmm(w1, w0, u, v) \ - do { \ - UWtype __x0, __x1, __x2, __x3; \ - UHWtype __ul, __vl, __uh, __vh; \ - \ - __ul = __ll_lowpart(u); \ - __uh = __ll_highpart(u); \ - __vl = __ll_lowpart(v); \ - __vh = __ll_highpart(v); \ - \ - __x0 = (UWtype) __ul * __vl; \ - __x1 = (UWtype) __ul * __vh; \ - __x2 = (UWtype) __uh * __vl; \ - __x3 = (UWtype) __uh * __vh; \ - \ - __x1 += __ll_highpart(__x0); /* this can't give carry */\ - __x1 += __x2; /* but this indeed can */ \ - if (__x1 < __x2) /* did we get it? */ \ - __x3 += __ll_B; /* yes, add it in the proper pos */ \ - \ - (w1) = __x3 + __ll_highpart(__x1); \ - (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0);\ - } while (0) -#endif - -#if !defined(__umulsidi3) -#define __umulsidi3(u, v) ({ \ - DWunion __w; \ - umul_ppmm(__w.s.high, __w.s.low, u, v); \ - __w.ll; \ - }) -#endif - -DWtype __muldi3(DWtype u, DWtype v) -{ - const DWunion uu = {.ll = u}; - const DWunion vv = {.ll = v}; - DWunion w = {.ll = __umulsidi3(uu.s.low, vv.s.low)}; - - w.s.high += ((UWtype) uu.s.low * (UWtype) vv.s.high - + (UWtype) uu.s.high * (UWtype) vv.s.low); - - return w.ll; -} diff --git a/trunk/arch/x86/include/asm/acpi.h b/trunk/arch/x86/include/asm/acpi.h index 211ca3f7fd16..55d106b5e31b 100644 --- a/trunk/arch/x86/include/asm/acpi.h +++ b/trunk/arch/x86/include/asm/acpi.h @@ -185,16 +185,17 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; -extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end); +extern int acpi_get_nodes(struct bootnode *physnodes); extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - -#ifdef CONFIG_NUMA_EMU extern void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes); +#else +static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes) +{ +} #endif -#endif /* CONFIG_ACPI_NUMA */ #define acpi_unlazy_tlb(x) leave_mm(x) diff --git a/trunk/arch/x86/include/asm/amd_nb.h b/trunk/arch/x86/include/asm/amd_nb.h index 64dc82ee19f0..6aee50d655d1 100644 --- a/trunk/arch/x86/include/asm/amd_nb.h +++ b/trunk/arch/x86/include/asm/amd_nb.h @@ -3,27 +3,16 @@ #include -struct amd_nb_bus_dev_range { - u8 bus; - u8 dev_base; - u8 dev_limit; -}; - extern struct pci_device_id amd_nb_misc_ids[]; -extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; struct bootnode; extern int early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); +extern int amd_get_nodes(struct bootnode *nodes); extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int amd_scan_nodes(void); -#ifdef CONFIG_NUMA_EMU -extern void amd_fake_nodes(const struct bootnode 
*nodes, int nr_nodes); -extern void amd_get_nodes(struct bootnode *nodes); -#endif - struct amd_northbridge { struct pci_dev *misc; }; diff --git a/trunk/arch/x86/include/asm/fixmap.h b/trunk/arch/x86/include/asm/fixmap.h index 4729b2b63117..0141b234406f 100644 --- a/trunk/arch/x86/include/asm/fixmap.h +++ b/trunk/arch/x86/include/asm/fixmap.h @@ -116,11 +116,11 @@ enum fixed_addresses { #endif FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ + __end_of_permanent_fixed_addresses, + #ifdef CONFIG_X86_MRST FIX_LNW_VRTC, #endif - __end_of_permanent_fixed_addresses, - /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. diff --git a/trunk/arch/x86/include/asm/gpio.h b/trunk/arch/x86/include/asm/gpio.h index 91d915a65259..49dbfdfa50f9 100644 --- a/trunk/arch/x86/include/asm/gpio.h +++ b/trunk/arch/x86/include/asm/gpio.h @@ -38,9 +38,12 @@ static inline int gpio_cansleep(unsigned int gpio) return __gpio_cansleep(gpio); } +/* + * Not implemented, yet. + */ static inline int gpio_to_irq(unsigned int gpio) { - return __gpio_to_irq(gpio); + return -ENOSYS; } static inline int irq_to_gpio(unsigned int irq) diff --git a/trunk/arch/x86/include/asm/kdebug.h b/trunk/arch/x86/include/asm/kdebug.h index ca242d35e873..f23eb2528464 100644 --- a/trunk/arch/x86/include/asm/kdebug.h +++ b/trunk/arch/x86/include/asm/kdebug.h @@ -18,6 +18,7 @@ enum die_val { DIE_TRAP, DIE_GPF, DIE_CALL, + DIE_NMI_IPI, DIE_PAGE_FAULT, DIE_NMIUNKNOWN, }; diff --git a/trunk/arch/x86/include/asm/mach_traps.h b/trunk/arch/x86/include/asm/mach_traps.h index 72a8b52e7dfd..f7920601e472 100644 --- a/trunk/arch/x86/include/asm/mach_traps.h +++ b/trunk/arch/x86/include/asm/mach_traps.h @@ -7,19 +7,9 @@ #include -#define NMI_REASON_PORT 0x61 - -#define NMI_REASON_SERR 0x80 -#define NMI_REASON_IOCHK 0x40 -#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK) - -#define NMI_REASON_CLEAR_SERR 0x04 -#define NMI_REASON_CLEAR_IOCHK 0x08 -#define NMI_REASON_CLEAR_MASK 0x0f - static inline unsigned char get_nmi_reason(void) { - return inb(NMI_REASON_PORT); + return inb(0x61); } static inline void reassert_nmi(void) diff --git a/trunk/arch/x86/include/asm/nmi.h b/trunk/arch/x86/include/asm/nmi.h index c76f5b92b840..c4021b953510 100644 --- a/trunk/arch/x86/include/asm/nmi.h +++ b/trunk/arch/x86/include/asm/nmi.h @@ -23,26 +23,6 @@ void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif -/* - * Define some priorities for the nmi notifier call chain. - * - * Create a local nmi bit that has a higher priority than - * external nmis, because the local ones are more frequent. - * - * Also setup some default high/normal/low settings for - * subsystems to registers with. Using 4 bits to seperate - * the priorities. This can go alot higher if needed be. 
- */ - -#define NMI_LOCAL_SHIFT 16 /* randomly picked */ -#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT) -#define NMI_HIGH_PRIOR (1ULL << 8) -#define NMI_NORMAL_PRIOR (1ULL << 4) -#define NMI_LOW_PRIOR (1ULL << 0) -#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR) -#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) -#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) - void stop_nmi(void); void restart_nmi(void); diff --git a/trunk/arch/x86/include/asm/numa_64.h b/trunk/arch/x86/include/asm/numa_64.h index 5ae87285a502..823e070e7c26 100644 --- a/trunk/arch/x86/include/asm/numa_64.h +++ b/trunk/arch/x86/include/asm/numa_64.h @@ -38,7 +38,7 @@ extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) #endif /* CONFIG_NUMA_EMU */ #else diff --git a/trunk/arch/x86/include/asm/perf_event_p4.h b/trunk/arch/x86/include/asm/perf_event_p4.h index e2f6a99f14ab..295e2ff18a6a 100644 --- a/trunk/arch/x86/include/asm/perf_event_p4.h +++ b/trunk/arch/x86/include/asm/perf_event_p4.h @@ -20,9 +20,6 @@ #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) #define ARCH_P4_MAX_CCCR (18) -#define ARCH_P4_CNTRVAL_BITS (40) -#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1) - #define P4_ESCR_EVENT_MASK 0x7e000000U #define P4_ESCR_EVENT_SHIFT 25 #define P4_ESCR_EVENTMASK_MASK 0x01fffe00U diff --git a/trunk/arch/x86/kernel/amd_nb.c b/trunk/arch/x86/kernel/amd_nb.c index 0a99f7198bc3..affacb5e0065 100644 --- a/trunk/arch/x86/kernel/amd_nb.c +++ b/trunk/arch/x86/kernel/amd_nb.c @@ -20,13 +20,6 @@ struct pci_device_id amd_nb_misc_ids[] = { }; EXPORT_SYMBOL(amd_nb_misc_ids); -const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { - { 0x00, 0x18, 0x20 }, - { 0xff, 0x00, 0x20 }, - { 0xfe, 0x00, 0x20 }, - { } -}; - struct amd_northbridge_info amd_northbridges; EXPORT_SYMBOL(amd_northbridges); diff --git a/trunk/arch/x86/kernel/aperture_64.c b/trunk/arch/x86/kernel/aperture_64.c index 5955a7800a96..dcd7c83e1659 100644 --- a/trunk/arch/x86/kernel/aperture_64.c +++ b/trunk/arch/x86/kernel/aperture_64.c @@ -39,6 +39,18 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; +struct bus_dev_range { + int bus; + int dev_base; + int dev_limit; +}; + +static struct bus_dev_range bus_dev_ranges[] __initdata = { + { 0x00, 0x18, 0x20}, + { 0xff, 0x00, 0x20}, + { 0xfe, 0x00, 0x20} +}; + static struct resource gart_resource = { .name = "GART", .flags = IORESOURCE_MEM, @@ -282,13 +294,13 @@ void __init early_gart_iommu_check(void) search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; - for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; - bus = amd_nb_bus_dev_ranges[i].bus; - dev_base = amd_nb_bus_dev_ranges[i].dev_base; - dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) @@ -337,13 +349,13 @@ void __init early_gart_iommu_check(void) return; /* disable them all at first */ - for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; - 
bus = amd_nb_bus_dev_ranges[i].bus; - dev_base = amd_nb_bus_dev_ranges[i].dev_base; - dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) @@ -378,14 +390,14 @@ int __init gart_iommu_hole_init(void) fix = 0; node = 0; - for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; u32 ctl; - bus = amd_nb_bus_dev_ranges[i].bus; - dev_base = amd_nb_bus_dev_ranges[i].dev_base; - dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) @@ -493,7 +505,7 @@ int __init gart_iommu_hole_init(void) } /* Fix up the north bridges */ - for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus, dev_base, dev_limit; /* @@ -502,9 +514,9 @@ int __init gart_iommu_hole_init(void) */ u32 ctl = DISTLBWALKPRB | aper_order << 1; - bus = amd_nb_bus_dev_ranges[i].bus; - dev_base = amd_nb_bus_dev_ranges[i].dev_base; - dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; diff --git a/trunk/arch/x86/kernel/apic/apic.c b/trunk/arch/x86/kernel/apic/apic.c index 06c196d7e59c..a51345ba449e 100644 --- a/trunk/arch/x86/kernel/apic/apic.c +++ b/trunk/arch/x86/kernel/apic/apic.c @@ -684,7 +684,7 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); diff --git a/trunk/arch/x86/kernel/apic/hw_nmi.c b/trunk/arch/x86/kernel/apic/hw_nmi.c index 79fd43ca6f96..72ec29e1ae06 100644 --- a/trunk/arch/x86/kernel/apic/hw_nmi.c +++ b/trunk/arch/x86/kernel/apic/hw_nmi.c @@ -68,6 +68,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, switch (cmd) { case DIE_NMI: + case DIE_NMI_IPI: break; default: @@ -95,7 +96,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, static __read_mostly struct notifier_block backtrace_notifier = { .notifier_call = arch_trigger_all_cpu_backtrace_handler, .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, + .priority = 1 }; static int __init register_trigger_all_cpu_backtrace(void) diff --git a/trunk/arch/x86/kernel/apic/x2apic_uv_x.c b/trunk/arch/x86/kernel/apic/x2apic_uv_x.c index bd16b58b8850..ecca5f41ad2c 100644 --- a/trunk/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/trunk/arch/x86/kernel/apic/x2apic_uv_x.c @@ -378,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = { static __cpuinit void set_x2apic_extra_bits(int pnode) { - __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); + __this_cpu_write(x2apic_extra_bits, (pnode << 6)); } /* @@ -641,7 +641,7 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void 
*data) { - if (reason != DIE_NMIUNKNOWN) + if (reason != DIE_NMI_IPI) return NOTIFY_OK; if (in_crash_kexec) diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c b/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c index a77971979564..e7dbde7bfedb 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -25,7 +25,6 @@ #include #include #include -#include /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -84,7 +83,7 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) @@ -96,7 +95,7 @@ static int mce_raise_notify(struct notifier_block *self, static struct notifier_block mce_raise_nb = { .notifier_call = mce_raise_notify, - .priority = NMI_LOCAL_NORMAL_PRIOR, + .priority = 1000, }; /* Inject mce on current CPU */ diff --git a/trunk/arch/x86/kernel/cpu/perf_event.c b/trunk/arch/x86/kernel/cpu/perf_event.c index 9d977a2ea693..04921017abe0 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event.c +++ b/trunk/arch/x86/kernel/cpu/perf_event.c @@ -1267,6 +1267,7 @@ perf_event_nmi_handler(struct notifier_block *self, switch (cmd) { case DIE_NMI: + case DIE_NMI_IPI: break; case DIE_NMIUNKNOWN: this_nmi = percpu_read(irq_stat.__nmi_count); @@ -1316,7 +1317,7 @@ perf_event_nmi_handler(struct notifier_block *self, static __read_mostly struct notifier_block perf_event_nmi_notifier = { .notifier_call = perf_event_nmi_handler, .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, + .priority = 1 }; static struct event_constraint unconstrained; diff --git a/trunk/arch/x86/kernel/cpu/perf_event_p4.c b/trunk/arch/x86/kernel/cpu/perf_event_p4.c index e56b9bfbabd1..81400b93e694 100644 --- a/trunk/arch/x86/kernel/cpu/perf_event_p4.c +++ b/trunk/arch/x86/kernel/cpu/perf_event_p4.c @@ -753,21 +753,19 @@ static int p4_hw_config(struct perf_event *event) static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) { - u64 v; + int overflow = 0; + u32 low, high; - /* an official way for overflow indication */ - rdmsrl(hwc->config_base + hwc->idx, v); - if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); - return 1; - } + rdmsr(hwc->config_base + hwc->idx, low, high); - /* it might be unflagged overflow */ - rdmsrl(hwc->event_base + hwc->idx, v); - if (!(v & ARCH_P4_CNTRVAL_MASK)) - return 1; + /* we need to check high bit for unflagged overflows */ + if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { + overflow = 1; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + ((u64)low) & ~P4_CCCR_OVF); + } - return 0; + return overflow; } static void p4_pmu_disable_pebs(void) @@ -1154,9 +1152,9 @@ static __initconst const struct x86_pmu p4_pmu = { */ .num_counters = ARCH_P4_MAX_CCCR, .apic = 1, - .cntval_bits = ARCH_P4_CNTRVAL_BITS, - .cntval_mask = ARCH_P4_CNTRVAL_MASK, - .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .max_period = (1ULL << 39) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/trunk/arch/x86/kernel/dumpstack.c b/trunk/arch/x86/kernel/dumpstack.c index d6fb146c0d8b..8474c998cbd4 100644 --- 
a/trunk/arch/x86/kernel/dumpstack.c +++ b/trunk/arch/x86/kernel/dumpstack.c @@ -197,8 +197,14 @@ void show_stack(struct task_struct *task, unsigned long *sp) */ void dump_stack(void) { + unsigned long bp = 0; unsigned long stack; +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S index d3b895f375d3..e3ba417e8697 100644 --- a/trunk/arch/x86/kernel/entry_64.S +++ b/trunk/arch/x86/kernel/entry_64.S @@ -299,21 +299,17 @@ ENDPROC(native_usergs_sysret64) ENTRY(save_args) XCPT_FRAME cld - /* - * start from rbp in pt_regs and jump over - * return address. - */ - movq_cfi rdi, RDI+8-RBP - movq_cfi rsi, RSI+8-RBP - movq_cfi rdx, RDX+8-RBP - movq_cfi rcx, RCX+8-RBP - movq_cfi rax, RAX+8-RBP - movq_cfi r8, R8+8-RBP - movq_cfi r9, R9+8-RBP - movq_cfi r10, R10+8-RBP - movq_cfi r11, R11+8-RBP - - leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 8 /* push %rbp */ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ testl $3, CS(%rdi) @@ -786,9 +782,8 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve pt_regs for scratch regs and rbp */ - subq $ORIG_RAX-RBP, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP + subq $ORIG_RAX-ARGOFFSET+8, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 call save_args PARTIAL_FRAME 0 call \func @@ -813,14 +808,9 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq - CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 - - /* we did not save rbx, restore only from ARGOFFSET */ - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) diff --git a/trunk/arch/x86/kernel/kgdb.c b/trunk/arch/x86/kernel/kgdb.c index a4130005028a..cd21b654dec6 100644 --- a/trunk/arch/x86/kernel/kgdb.c +++ b/trunk/arch/x86/kernel/kgdb.c @@ -48,7 +48,6 @@ #include #include #include -#include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -526,6 +525,10 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) } return NOTIFY_DONE; + case DIE_NMI_IPI: + /* Just ignore, we will handle the roundup on DIE_NMI. 
*/ + return NOTIFY_DONE; + case DIE_NMIUNKNOWN: if (was_in_debug_nmi[raw_smp_processor_id()]) { was_in_debug_nmi[raw_smp_processor_id()] = 0; @@ -603,7 +606,7 @@ static struct notifier_block kgdb_notifier = { /* * Lowest-prio notifier priority, we want to be notified last: */ - .priority = NMI_LOCAL_LOW_PRIOR, + .priority = -INT_MAX, }; /** diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c index fc7aae1e2bc7..c495aa8d4815 100644 --- a/trunk/arch/x86/kernel/reboot.c +++ b/trunk/arch/x86/kernel/reboot.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_X86_32 # include @@ -748,7 +747,7 @@ static int crash_nmi_callback(struct notifier_block *self, { int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; cpu = raw_smp_processor_id(); @@ -779,8 +778,6 @@ static void smp_send_nmi_allbutself(void) static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, - /* we want to be the first one called */ - .priority = NMI_LOCAL_HIGH_PRIOR+1, }; /* Halt all other CPUs, calling the specified function on each of them diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c index 763df77343dd..c7149c96d079 100644 --- a/trunk/arch/x86/kernel/smpboot.c +++ b/trunk/arch/x86/kernel/smpboot.c @@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); */ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); -void cpu_hotplug_driver_lock(void) +void cpu_hotplug_driver_lock() { mutex_lock(&x86_cpu_hotplug_driver_mutex); } -void cpu_hotplug_driver_unlock(void) +void cpu_hotplug_driver_unlock() { mutex_unlock(&x86_cpu_hotplug_driver_mutex); } diff --git a/trunk/arch/x86/kernel/traps.c b/trunk/arch/x86/kernel/traps.c index b9b67166f9de..c76aaca5694d 100644 --- a/trunk/arch/x86/kernel/traps.c +++ b/trunk/arch/x86/kernel/traps.c @@ -84,11 +84,6 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; int unknown_nmi_panic; -/* - * Prevent NMI reason port (0x61) being accessed simultaneously, can - * only be used in NMI handler. - */ -static DEFINE_RAW_SPINLOCK(nmi_reason_lock); static inline void conditional_sti(struct pt_regs *regs) { @@ -315,15 +310,15 @@ static int __init setup_unknown_nmi_panic(char *str) __setup("unknown_nmi_panic", setup_unknown_nmi_panic); static notrace __kprobes void -pci_serr_error(unsigned char reason, struct pt_regs *regs) +mem_parity_error(unsigned char reason, struct pt_regs *regs) { - pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ #if defined(CONFIG_EDAC) if (edac_handler_set()) { edac_atomic_assert_error(); @@ -334,11 +329,11 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - /* Clear and disable the PCI SERR error line. */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; - outb(reason, NMI_REASON_PORT); + /* Clear and disable the memory parity error line. 
*/ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); } static notrace __kprobes void @@ -346,17 +341,15 @@ io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; - pr_emerg( - "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); if (panic_on_io_nmi) panic("NMI IOCK error: Not continuing"); /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); + reason = (reason & 0xf) | 8; + outb(reason, 0x61); i = 20000; while (--i) { @@ -364,8 +357,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) udelay(100); } - reason &= ~NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); + reason &= ~8; + outb(reason, 0x61); } static notrace __kprobes void @@ -384,50 +377,57 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) return; } #endif - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); - pr_emerg("Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; - /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; + cpu = smp_processor_id(); - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); - reason = get_nmi_reason(); + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) + reason = get_nmi_reason(); - if (reason & NMI_REASON_MASK) { - if (reason & NMI_REASON_SERR) - pci_serr_error(reason, regs); - else if (reason & NMI_REASON_IOCHK) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active - * meanwhile as it's edge-triggered: - */ - reassert_nmi(); + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #endif - raw_spin_unlock(&nmi_reason_lock); + unknown_nmi_error(reason, regs); + return; } - raw_spin_unlock(&nmi_reason_lock); + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; - unknown_nmi_error(reason, regs); + /* AK: following checks seem to be broken on modern chipsets. 
FIXME */ + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif } dotraplinkage notrace __kprobes void diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c index 823f79a17ad1..03d2ea82f35a 100644 --- a/trunk/arch/x86/kernel/tsc.c +++ b/trunk/arch/x86/kernel/tsc.c @@ -965,7 +965,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int __init init_tsc_clocksource(void) { - if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + if (!cpu_has_tsc || tsc_disabled > 0) return 0; if (tsc_clocksource_reliable) diff --git a/trunk/arch/x86/mm/amdtopology_64.c b/trunk/arch/x86/mm/amdtopology_64.c index f21962c435ed..08a0069b87a5 100644 --- a/trunk/arch/x86/mm/amdtopology_64.c +++ b/trunk/arch/x86/mm/amdtopology_64.c @@ -27,7 +27,6 @@ #include static struct bootnode __initdata nodes[8]; -static unsigned char __initdata nodeids[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; static __init int find_northbridge(void) @@ -69,6 +68,19 @@ static __init void early_get_boot_cpu_id(void) #endif } +int __init amd_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); @@ -101,7 +113,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeids[i] = nodeid = limit & 7; + nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) pr_info("Skipping disabled node %d\n", i); @@ -181,76 +193,6 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } -#ifdef CONFIG_NUMA_EMU -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - -void __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - - for_each_node_mask(i, nodes_parsed) { - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; - } -} - -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for (i = 0; i < 8; i++) - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - return ret; -} - -/* - * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be - * setup to represent the physical topology but reflect the emulated - * environment. For each emulated node, the real node which it appears on is - * found and a fake pxm to nid mapping is created which mirrors the actual - * locality. node_distance() then represents the correct distances between - * emulated nodes by using the fake acpi mappings to pxms. 
- */ -void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base = 0; - int i; - - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) - apicid_base = boot_cpu_physical_apicid; - - for (i = 0; i < nr_nodes; i++) { - int index; - int nid; - int j; - - nid = find_node_by_addr(nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - - index = nodeids[nid] << bits; - if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) - for (j = apicid_base; j < cores + apicid_base; j++) - fake_apicid_to_node[index + j] = i; -#ifdef CONFIG_ACPI_NUMA - __acpi_map_pxm_to_node(nid, i); -#endif - } - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); -} -#endif /* CONFIG_NUMA_EMU */ - int __init amd_scan_nodes(void) { unsigned int bits; diff --git a/trunk/arch/x86/mm/numa_64.c b/trunk/arch/x86/mm/numa_64.c index 1e72102e80c9..7762a517d69d 100644 --- a/trunk/arch/x86/mm/numa_64.c +++ b/trunk/arch/x86/mm/numa_64.c @@ -260,30 +260,30 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; +static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, int acpi, int amd) { + int nr_nodes = 0; int ret = 0; int i; - memset(physnodes, 0, sizeof(physnodes)); #ifdef CONFIG_ACPI_NUMA if (acpi) - acpi_get_nodes(physnodes, start, end); + nr_nodes = acpi_get_nodes(physnodes); #endif #ifdef CONFIG_AMD_NUMA if (amd) - amd_get_nodes(physnodes); + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_nodes; i++) { if (physnodes[i].start == physnodes[i].end) continue; if (physnodes[i].start > end) { @@ -298,6 +298,17 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, physnodes[i].start = start; if (physnodes[i].end > end) physnodes[i].end = end; + } + + /* + * Remove all nodes that have no memory or were truncated because of the + * limited address range. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + physnodes[ret].start = physnodes[i].start; + physnodes[ret].end = physnodes[i].end; ret++; } @@ -313,24 +324,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, return ret; } -static void __init fake_physnodes(int acpi, int amd, int nr_nodes) -{ - int i; - - BUG_ON(acpi && amd); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_fake_nodes(nodes, nr_nodes); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_fake_nodes(nodes, nr_nodes); -#endif - if (!acpi && !amd) - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); -} - /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -359,7 +352,8 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * to max_addr. The return value is the number of nodes allocated. 
*/ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) +static int __init split_nodes_interleave(u64 addr, u64 max_addr, + int nr_phys_nodes, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 size; @@ -390,7 +384,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) return -1; } - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_phys_nodes; i++) if (physnodes[i].start != physnodes[i].end) node_set(i, physnode_mask); @@ -559,9 +553,11 @@ static int __init numa_emulation(unsigned long start_pfn, { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; + int num_phys_nodes; int num_nodes; int i; + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -576,7 +572,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long n; n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, n); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } if (num_nodes < 0) @@ -599,8 +595,7 @@ static int __init numa_emulation(unsigned long start_pfn, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } - setup_physnodes(addr, max_addr, acpi, amd); - fake_physnodes(acpi, amd, num_nodes); + acpi_fake_nodes(nodes, num_nodes); numa_init_array(); return 0; } @@ -615,12 +610,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -776,7 +767,6 @@ void __cpuinit numa_clear_node(int cpu) #ifndef CONFIG_DEBUG_PER_CPU_MAPS -#ifndef CONFIG_NUMA_EMU void __cpuinit numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); @@ -786,115 +776,34 @@ void __cpuinit numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -#else -void __cpuinit numa_add_cpu(int cpu) -{ - unsigned long addr; - u16 apicid; - int physnid; - int nid = NUMA_NO_NODE; - - apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - if (apicid != BAD_APICID) - nid = apicid_to_node[apicid]; - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - /* - * Use the starting address of the emulated node to find which physical - * node it is allocated on. - */ - addr = node_start_pfn(nid) << PAGE_SHIFT; - for (physnid = 0; physnid < MAX_NUMNODES; physnid++) - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - break; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. 
- */ - for_each_online_node(nid) { - addr = node_start_pfn(nid) << PAGE_SHIFT; - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); - } -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#endif /* !CONFIG_NUMA_EMU */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */ -static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - char buf[64]; - - mask = node_to_cpumask_map[node]; - if (!mask) { - pr_err("node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return NULL; - } - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); - return mask; -} /* * --------- debug versions of the numa functions --------- */ -#ifndef CONFIG_NUMA_EMU static void __cpuinit numa_set_cpumask(int cpu, int enable) { + int node = early_cpu_to_node(cpu); struct cpumask *mask; + char buf[64]; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) + mask = node_to_cpumask_map[node]; + if (mask == NULL) { + printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); return; + } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); -} -#else -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - int i; - for_each_online_node(i) { - unsigned long addr; - - addr = node_start_pfn(i) << PAGE_SHIFT; - if (addr < physnodes[node].start || - addr >= physnodes[node].end) - continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); - } + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); } -#endif /* CONFIG_NUMA_EMU */ void __cpuinit numa_add_cpu(int cpu) { diff --git a/trunk/arch/x86/mm/srat_64.c b/trunk/arch/x86/mm/srat_64.c index 603d285d1daa..171a0aacb99a 100644 --- a/trunk/arch/x86/mm/srat_64.c +++ b/trunk/arch/x86/mm/srat_64.c @@ -349,19 +349,18 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} -#ifdef CONFIG_NUMA_EMU -void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end) +int __init acpi_get_nodes(struct bootnode *physnodes) { int i; + int ret = 0; for_each_node_mask(i, nodes_parsed) { - cutoff_node(i, start, end); - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; } + return ret; } -#endif /* CONFIG_NUMA_EMU */ /* Use the information discovered above to actually set up the nodes. 
*/ int __init acpi_scan_nodes(unsigned long start, unsigned long end) @@ -506,6 +505,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; + printk(KERN_INFO "Faking PXM affinity for fake nodes on real " + "topology.\n"); for (i = 0; i < num_nodes; i++) { int nid, pxm; @@ -525,17 +526,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) fake_apicid_to_node[j] == NUMA_NO_NODE) fake_apicid_to_node[j] = i; } - - /* - * If there are apicid-to-node mappings for physical nodes that do not - * have a corresponding emulated node, it should default to a guaranteed - * value. - */ - for (i = 0; i < MAX_LOCAL_APIC; i++) - if (apicid_to_node[i] != NUMA_NO_NODE && - fake_apicid_to_node[i] == NUMA_NO_NODE) - fake_apicid_to_node[i] = 0; - for (i = 0; i < num_nodes; i++) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); diff --git a/trunk/arch/x86/oprofile/nmi_int.c b/trunk/arch/x86/oprofile/nmi_int.c index e2b7b0c06cdf..f24a8533bcdf 100644 --- a/trunk/arch/x86/oprofile/nmi_int.c +++ b/trunk/arch/x86/oprofile/nmi_int.c @@ -65,6 +65,7 @@ static int profile_exceptions_notify(struct notifier_block *self, switch (val) { case DIE_NMI: + case DIE_NMI_IPI: if (ctr_running) model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); else if (!nmi_enabled) @@ -360,7 +361,7 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, + .priority = 2 }; static void nmi_cpu_restore_registers(struct op_msrs *msrs) diff --git a/trunk/arch/x86/oprofile/nmi_timer_int.c b/trunk/arch/x86/oprofile/nmi_timer_int.c index 720bf5a53c51..0636dd93cef8 100644 --- a/trunk/arch/x86/oprofile/nmi_timer_int.c +++ b/trunk/arch/x86/oprofile/nmi_timer_int.c @@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self, static struct notifier_block profile_timer_exceptions_nb = { .notifier_call = profile_timer_exceptions_notify, .next = NULL, - .priority = NMI_LOW_PRIOR, + .priority = 0 }; static int timer_start(void) diff --git a/trunk/arch/x86/pci/amd_bus.c b/trunk/arch/x86/pci/amd_bus.c index e27dffbbb1a7..fc1e8fe07e5c 100644 --- a/trunk/arch/x86/pci/amd_bus.c +++ b/trunk/arch/x86/pci/amd_bus.c @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -379,34 +378,6 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = { .notifier_call = amd_cpu_notify, }; -static void __init pci_enable_pci_io_ecs(void) -{ -#ifdef CONFIG_AMD_NB - unsigned int i, n; - - for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) { - u8 bus = amd_nb_bus_dev_ranges[i].bus; - u8 slot = amd_nb_bus_dev_ranges[i].dev_base; - u8 limit = amd_nb_bus_dev_ranges[i].dev_limit; - - for (; slot < limit; ++slot) { - u32 val = read_pci_config(bus, slot, 3, 0); - - if (!early_is_amd_nb(val)) - continue; - - val = read_pci_config(bus, slot, 3, 0x8c); - if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) { - val |= ENABLE_CF8_EXT_CFG >> 32; - write_pci_config(bus, slot, 3, 0x8c, val); - } - ++n; - } - } - pr_info("Extended Config Space enabled on %u nodes\n", n); -#endif -} - static int __init pci_io_ecs_init(void) { int cpu; @@ -415,10 +386,6 @@ static int __init pci_io_ecs_init(void) if (boot_cpu_data.x86 < 0x10) return 0; - /* Try the PCI method first. 
*/ - if (early_pci_allowed()) - pci_enable_pci_io_ecs(); - register_cpu_notifier(&amd_cpu_notifier); for_each_online_cpu(cpu) amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, diff --git a/trunk/drivers/char/ipmi/ipmi_watchdog.c b/trunk/drivers/char/ipmi/ipmi_watchdog.c index 320668f4c3aa..f4d334f2536e 100644 --- a/trunk/drivers/char/ipmi/ipmi_watchdog.c +++ b/trunk/drivers/char/ipmi/ipmi_watchdog.c @@ -1081,7 +1081,7 @@ ipmi_nmi(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; - if (val != DIE_NMIUNKNOWN) + if (val != DIE_NMI) return NOTIFY_OK; /* Hack, if it's a memory or I/O error, ignore it. */ diff --git a/trunk/drivers/mfd/sh_mobile_sdhi.c b/trunk/drivers/mfd/sh_mobile_sdhi.c index 0a7df44a93c0..f1714f93af9d 100644 --- a/trunk/drivers/mfd/sh_mobile_sdhi.c +++ b/trunk/drivers/mfd/sh_mobile_sdhi.c @@ -131,17 +131,11 @@ static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_BLKSZ_2BYTES; - /* - * All SDHI blocks support SDIO IRQ signalling. - */ - mmc_data->flags |= TMIO_MMC_SDIO_IRQ; - if (p && p->dma_slave_tx >= 0 && p->dma_slave_rx >= 0) { priv->param_tx.slave_id = p->dma_slave_tx; priv->param_rx.slave_id = p->dma_slave_rx; priv->dma_priv.chan_priv_tx = &priv->param_tx; priv->dma_priv.chan_priv_rx = &priv->param_rx; - priv->dma_priv.alignment_shift = 1; /* 2-byte alignment */ mmc_data->dma = &priv->dma_priv; } diff --git a/trunk/drivers/mmc/card/Kconfig b/trunk/drivers/mmc/card/Kconfig index 2a876c4099cd..57e4416b9ef0 100644 --- a/trunk/drivers/mmc/card/Kconfig +++ b/trunk/drivers/mmc/card/Kconfig @@ -16,7 +16,6 @@ config MMC_BLOCK config MMC_BLOCK_MINORS int "Number of minors per block device" - depends on MMC_BLOCK range 4 256 default 8 help diff --git a/trunk/drivers/mmc/core/Kconfig b/trunk/drivers/mmc/core/Kconfig index ef103871517f..bb22ffd76ef8 100644 --- a/trunk/drivers/mmc/core/Kconfig +++ b/trunk/drivers/mmc/core/Kconfig @@ -16,14 +16,3 @@ config MMC_UNSAFE_RESUME This option sets a default which can be overridden by the module parameter "removable=0" or "removable=1". - -config MMC_CLKGATE - bool "MMC host clock gating (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This will attempt to aggressively gate the clock to the MMC card. - This is done to save power due to gating off the logic and bus - noise when the MMC card is not in use. Your host driver has to - support handling this in order for it to be of any use. - - If unsure, say N. 
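The hunks above back out the aggressive clock-gating support: the MMC_CLKGATE Kconfig entry goes away, the mmc_gate_clock()/mmc_ungate_clock()/mmc_set_ungated() helpers are dropped from core.c, and mmc_set_data_timeout() returns to reading host->ios.clock directly. For readers unfamiliar with the feature, the sketch below (plain user-space C, not kernel code; the struct and function names are invented for illustration) models the reference-counted idea behind it: each request holds the clock, and only when the hold count drops to zero is ios.clock forced to 0 to gate the controller clock, with the old frequency cached so it can be restored on the next request. The real driver defers the gating to a workqueue and waits a few bus-clock cycles first.

    /*
     * Minimal model of reference-counted MMC clock gating (illustrative only;
     * the kernel version uses spinlocks, a workqueue and mmc_set_ios()).
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct clk_gate {
            pthread_mutex_t lock;
            unsigned int requests;   /* in-flight requests holding the clock */
            unsigned int hz;         /* current clock, 0 == gated */
            unsigned int saved_hz;   /* frequency to restore on ungate */
            bool gated;
    };

    /* Called before a request is issued: ungate if we are the first user. */
    static void clk_hold(struct clk_gate *cg)
    {
            pthread_mutex_lock(&cg->lock);
            if (cg->gated) {
                    cg->hz = cg->saved_hz;
                    cg->gated = false;
                    printf("ungated clock at %u Hz\n", cg->hz);
            }
            cg->requests++;
            pthread_mutex_unlock(&cg->lock);
    }

    /*
     * Called when a request completes: gate once idle.  The real code
     * schedules this from a worker after a short delay instead of doing
     * it inline.
     */
    static void clk_release(struct clk_gate *cg)
    {
            pthread_mutex_lock(&cg->lock);
            if (--cg->requests == 0 && !cg->gated) {
                    cg->saved_hz = cg->hz;
                    cg->hz = 0;          /* ios.clock = 0 gates the block clock */
                    cg->gated = true;
                    printf("gated clock (was %u Hz)\n", cg->saved_hz);
            }
            pthread_mutex_unlock(&cg->lock);
    }

    int main(void)
    {
            struct clk_gate cg = {
                    .lock = PTHREAD_MUTEX_INITIALIZER,
                    .hz = 400000, .saved_hz = 400000,
            };

            clk_hold(&cg);    /* first request keeps the clock running */
            clk_release(&cg); /* last request gates it again */
            clk_hold(&cg);    /* next request restores the cached frequency */
            clk_release(&cg);
            return 0;
    }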
diff --git a/trunk/drivers/mmc/core/bus.c b/trunk/drivers/mmc/core/bus.c index 63667a8f140c..af8dc6a2a317 100644 --- a/trunk/drivers/mmc/core/bus.c +++ b/trunk/drivers/mmc/core/bus.c @@ -303,14 +303,14 @@ int mmc_add_card(struct mmc_card *card) type, card->rca); } -#ifdef CONFIG_DEBUG_FS - mmc_add_card_debugfs(card); -#endif - ret = device_add(&card->dev); if (ret) return ret; +#ifdef CONFIG_DEBUG_FS + mmc_add_card_debugfs(card); +#endif + mmc_card_set_present(card); return 0; diff --git a/trunk/drivers/mmc/core/core.c b/trunk/drivers/mmc/core/core.c index 6625c057be05..a3a780faf85a 100644 --- a/trunk/drivers/mmc/core/core.c +++ b/trunk/drivers/mmc/core/core.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -131,8 +130,6 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) if (mrq->done) mrq->done(mrq); - - mmc_host_clk_gate(host); } } @@ -193,7 +190,6 @@ mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) mrq->stop->mrq = mrq; } } - mmc_host_clk_ungate(host); host->ops->request(host, mrq); } @@ -299,9 +295,8 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) unsigned int timeout_us, limit_us; timeout_us = data->timeout_ns / 1000; - if (mmc_host_clk_rate(card->host)) - timeout_us += data->timeout_clks * 1000 / - (mmc_host_clk_rate(card->host) / 1000); + timeout_us += data->timeout_clks * 1000 / + (card->host->ios.clock / 1000); if (data->flags & MMC_DATA_WRITE) /* @@ -619,8 +614,6 @@ static inline void mmc_set_ios(struct mmc_host *host) ios->power_mode, ios->chip_select, ios->vdd, ios->bus_width, ios->timing); - if (ios->clock > 0) - mmc_set_ungated(host); host->ops->set_ios(host, ios); } @@ -648,61 +641,6 @@ void mmc_set_clock(struct mmc_host *host, unsigned int hz) mmc_set_ios(host); } -#ifdef CONFIG_MMC_CLKGATE -/* - * This gates the clock by setting it to 0 Hz. - */ -void mmc_gate_clock(struct mmc_host *host) -{ - unsigned long flags; - - spin_lock_irqsave(&host->clk_lock, flags); - host->clk_old = host->ios.clock; - host->ios.clock = 0; - host->clk_gated = true; - spin_unlock_irqrestore(&host->clk_lock, flags); - mmc_set_ios(host); -} - -/* - * This restores the clock from gating by using the cached - * clock value. - */ -void mmc_ungate_clock(struct mmc_host *host) -{ - /* - * We should previously have gated the clock, so the clock shall - * be 0 here! The clock may however be 0 during initialization, - * when some request operations are performed before setting - * the frequency. When ungate is requested in that situation - * we just ignore the call. - */ - if (host->clk_old) { - BUG_ON(host->ios.clock); - /* This call will also set host->clk_gated to false */ - mmc_set_clock(host, host->clk_old); - } -} - -void mmc_set_ungated(struct mmc_host *host) -{ - unsigned long flags; - - /* - * We've been given a new frequency while the clock is gated, - * so make sure we regard this as ungating it. - */ - spin_lock_irqsave(&host->clk_lock, flags); - host->clk_gated = false; - spin_unlock_irqrestore(&host->clk_lock, flags); -} - -#else -void mmc_set_ungated(struct mmc_host *host) -{ -} -#endif - /* * Change the bus mode (open drain/push-pull) of a host. 
*/ @@ -1486,57 +1424,35 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen) } EXPORT_SYMBOL(mmc_set_blocklen); -static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq) -{ - host->f_init = freq; - -#ifdef CONFIG_MMC_DEBUG - pr_info("%s: %s: trying to init card at %u Hz\n", - mmc_hostname(host), __func__, host->f_init); -#endif - mmc_power_up(host); - sdio_reset(host); - mmc_go_idle(host); - - mmc_send_if_cond(host, host->ocr_avail); - - /* Order's important: probe SDIO, then SD, then MMC */ - if (!mmc_attach_sdio(host)) - return 0; - if (!mmc_attach_sd(host)) - return 0; - if (!mmc_attach_mmc(host)) - return 0; - - mmc_power_off(host); - return -EIO; -} - void mmc_rescan(struct work_struct *work) { - static const unsigned freqs[] = { 400000, 300000, 200000, 100000 }; struct mmc_host *host = container_of(work, struct mmc_host, detect.work); + u32 ocr; + int err; + unsigned long flags; int i; + const unsigned freqs[] = { 400000, 300000, 200000, 100000 }; + + spin_lock_irqsave(&host->lock, flags); - if (host->rescan_disable) + if (host->rescan_disable) { + spin_unlock_irqrestore(&host->lock, flags); return; + } + + spin_unlock_irqrestore(&host->lock, flags); + mmc_bus_get(host); - /* - * if there is a _removable_ card registered, check whether it is - * still present - */ - if (host->bus_ops && host->bus_ops->detect && !host->bus_dead - && mmc_card_is_removable(host)) + /* if there is a card registered, check whether it is still present */ + if ((host->bus_ops != NULL) && host->bus_ops->detect && !host->bus_dead) host->bus_ops->detect(host); - /* - * Let mmc_bus_put() free the bus/bus_ops if we've found that - * the card is no longer present. - */ mmc_bus_put(host); + + mmc_bus_get(host); /* if there still is a card present, stop here */ @@ -1545,6 +1461,8 @@ void mmc_rescan(struct work_struct *work) goto out; } + /* detect a newly inserted card */ + /* * Only we can add a new handler, so it's safe to * release the lock here. @@ -1554,16 +1472,72 @@ void mmc_rescan(struct work_struct *work) if (host->ops->get_cd && host->ops->get_cd(host) == 0) goto out; - mmc_claim_host(host); for (i = 0; i < ARRAY_SIZE(freqs); i++) { - if (!mmc_rescan_try_freq(host, max(freqs[i], host->f_min))) - break; - if (freqs[i] < host->f_min) - break; - } - mmc_release_host(host); + mmc_claim_host(host); + + if (freqs[i] >= host->f_min) + host->f_init = freqs[i]; + else if (!i || freqs[i-1] > host->f_min) + host->f_init = host->f_min; + else { + mmc_release_host(host); + goto out; + } +#ifdef CONFIG_MMC_DEBUG + pr_info("%s: %s: trying to init card at %u Hz\n", + mmc_hostname(host), __func__, host->f_init); +#endif + mmc_power_up(host); + sdio_reset(host); + mmc_go_idle(host); + + mmc_send_if_cond(host, host->ocr_avail); - out: + /* + * First we search for SDIO... + */ + err = mmc_send_io_op_cond(host, 0, &ocr); + if (!err) { + if (mmc_attach_sdio(host, ocr)) { + mmc_claim_host(host); + /* + * Try SDMEM (but not MMC) even if SDIO + * is broken. + */ + if (mmc_send_app_op_cond(host, 0, &ocr)) + goto out_fail; + + if (mmc_attach_sd(host, ocr)) + mmc_power_off(host); + } + goto out; + } + + /* + * ...then normal SD... + */ + err = mmc_send_app_op_cond(host, 0, &ocr); + if (!err) { + if (mmc_attach_sd(host, ocr)) + mmc_power_off(host); + goto out; + } + + /* + * ...and finally MMC. 
+ */ + err = mmc_send_op_cond(host, 0, &ocr); + if (!err) { + if (mmc_attach_mmc(host, ocr)) + mmc_power_off(host); + goto out; + } + +out_fail: + mmc_release_host(host); + mmc_power_off(host); + } +out: if (host->caps & MMC_CAP_NEEDS_POLL) mmc_schedule_delayed_work(&host->detect, HZ); } @@ -1747,18 +1721,6 @@ int mmc_resume_host(struct mmc_host *host) if (!(host->pm_flags & MMC_PM_KEEP_POWER)) { mmc_power_up(host); mmc_select_voltage(host, host->ocr); - /* - * Tell runtime PM core we just powered up the card, - * since it still believes the card is powered off. - * Note that currently runtime PM is only enabled - * for SDIO cards that are MMC_CAP_POWER_OFF_CARD - */ - if (mmc_card_sdio(host->card) && - (host->caps & MMC_CAP_POWER_OFF_CARD)) { - pm_runtime_disable(&host->card->dev); - pm_runtime_set_active(&host->card->dev); - pm_runtime_enable(&host->card->dev); - } } BUG_ON(!host->bus_ops->resume); err = host->bus_ops->resume(host); diff --git a/trunk/drivers/mmc/core/core.h b/trunk/drivers/mmc/core/core.h index ca1fdde29df6..77240cd11bcf 100644 --- a/trunk/drivers/mmc/core/core.h +++ b/trunk/drivers/mmc/core/core.h @@ -33,9 +33,6 @@ void mmc_init_erase(struct mmc_card *card); void mmc_set_chip_select(struct mmc_host *host, int mode); void mmc_set_clock(struct mmc_host *host, unsigned int hz); -void mmc_gate_clock(struct mmc_host *host); -void mmc_ungate_clock(struct mmc_host *host); -void mmc_set_ungated(struct mmc_host *host); void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode); void mmc_set_bus_width(struct mmc_host *host, unsigned int width); void mmc_set_bus_width_ddr(struct mmc_host *host, unsigned int width, @@ -57,9 +54,9 @@ void mmc_rescan(struct work_struct *work); void mmc_start_host(struct mmc_host *host); void mmc_stop_host(struct mmc_host *host); -int mmc_attach_mmc(struct mmc_host *host); -int mmc_attach_sd(struct mmc_host *host); -int mmc_attach_sdio(struct mmc_host *host); +int mmc_attach_mmc(struct mmc_host *host, u32 ocr); +int mmc_attach_sd(struct mmc_host *host, u32 ocr); +int mmc_attach_sdio(struct mmc_host *host, u32 ocr); /* Module parameters */ extern int use_spi_crc; diff --git a/trunk/drivers/mmc/core/debugfs.c b/trunk/drivers/mmc/core/debugfs.c index 998797ed67a6..eed1405fd742 100644 --- a/trunk/drivers/mmc/core/debugfs.c +++ b/trunk/drivers/mmc/core/debugfs.c @@ -183,11 +183,6 @@ void mmc_add_host_debugfs(struct mmc_host *host) &mmc_clock_fops)) goto err_node; -#ifdef CONFIG_MMC_CLKGATE - if (!debugfs_create_u32("clk_delay", (S_IRUSR | S_IWUSR), - root, &host->clk_delay)) - goto err_node; -#endif return; err_node: diff --git a/trunk/drivers/mmc/core/host.c b/trunk/drivers/mmc/core/host.c index b3ac6c5bc5c6..10b8af27e03a 100644 --- a/trunk/drivers/mmc/core/host.c +++ b/trunk/drivers/mmc/core/host.c @@ -3,7 +3,6 @@ * * Copyright (C) 2003 Russell King, All Rights Reserved. * Copyright (C) 2007-2008 Pierre Ossman - * Copyright (C) 2010 Linus Walleij * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,7 +20,6 @@ #include #include -#include #include "core.h" #include "host.h" @@ -52,205 +50,6 @@ void mmc_unregister_host_class(void) static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); -#ifdef CONFIG_MMC_CLKGATE - -/* - * Enabling clock gating will make the core call out to the host - * once up and once down when it performs a request or card operation - * intermingled in any fashion. 
The driver will see this through - * set_ios() operations with ios.clock field set to 0 to gate (disable) - * the block clock, and to the old frequency to enable it again. - */ -static void mmc_host_clk_gate_delayed(struct mmc_host *host) -{ - unsigned long tick_ns; - unsigned long freq = host->ios.clock; - unsigned long flags; - - if (!freq) { - pr_debug("%s: frequency set to 0 in disable function, " - "this means the clock is already disabled.\n", - mmc_hostname(host)); - return; - } - /* - * New requests may have appeared while we were scheduling, - * then there is no reason to delay the check before - * clk_disable(). - */ - spin_lock_irqsave(&host->clk_lock, flags); - - /* - * Delay n bus cycles (at least 8 from MMC spec) before attempting - * to disable the MCI block clock. The reference count may have - * gone up again after this delay due to rescheduling! - */ - if (!host->clk_requests) { - spin_unlock_irqrestore(&host->clk_lock, flags); - tick_ns = DIV_ROUND_UP(1000000000, freq); - ndelay(host->clk_delay * tick_ns); - } else { - /* New users appeared while waiting for this work */ - spin_unlock_irqrestore(&host->clk_lock, flags); - return; - } - mutex_lock(&host->clk_gate_mutex); - spin_lock_irqsave(&host->clk_lock, flags); - if (!host->clk_requests) { - spin_unlock_irqrestore(&host->clk_lock, flags); - /* This will set host->ios.clock to 0 */ - mmc_gate_clock(host); - spin_lock_irqsave(&host->clk_lock, flags); - pr_debug("%s: gated MCI clock\n", mmc_hostname(host)); - } - spin_unlock_irqrestore(&host->clk_lock, flags); - mutex_unlock(&host->clk_gate_mutex); -} - -/* - * Internal work. Work to disable the clock at some later point. - */ -static void mmc_host_clk_gate_work(struct work_struct *work) -{ - struct mmc_host *host = container_of(work, struct mmc_host, - clk_gate_work); - - mmc_host_clk_gate_delayed(host); -} - -/** - * mmc_host_clk_ungate - ungate hardware MCI clocks - * @host: host to ungate. - * - * Makes sure the host ios.clock is restored to a non-zero value - * past this call. Increase clock reference count and ungate clock - * if we're the first user. - */ -void mmc_host_clk_ungate(struct mmc_host *host) -{ - unsigned long flags; - - mutex_lock(&host->clk_gate_mutex); - spin_lock_irqsave(&host->clk_lock, flags); - if (host->clk_gated) { - spin_unlock_irqrestore(&host->clk_lock, flags); - mmc_ungate_clock(host); - spin_lock_irqsave(&host->clk_lock, flags); - pr_debug("%s: ungated MCI clock\n", mmc_hostname(host)); - } - host->clk_requests++; - spin_unlock_irqrestore(&host->clk_lock, flags); - mutex_unlock(&host->clk_gate_mutex); -} - -/** - * mmc_host_may_gate_card - check if this card may be gated - * @card: card to check. - */ -static bool mmc_host_may_gate_card(struct mmc_card *card) -{ - /* If there is no card we may gate it */ - if (!card) - return true; - /* - * Don't gate SDIO cards! These need to be clocked at all times - * since they may be independent systems generating interrupts - * and other events. The clock requests counter from the core will - * go down to zero since the core does not need it, but we will not - * gate the clock, because there is somebody out there that may still - * be using it. - */ - if (mmc_card_sdio(card)) - return false; - - return true; -} - -/** - * mmc_host_clk_gate - gate off hardware MCI clocks - * @host: host to gate. - * - * Calls the host driver with ios.clock set to zero as often as possible - * in order to gate off hardware MCI clocks. Decrease clock reference - * count and schedule disabling of clock. 
- */ -void mmc_host_clk_gate(struct mmc_host *host) -{ - unsigned long flags; - - spin_lock_irqsave(&host->clk_lock, flags); - host->clk_requests--; - if (mmc_host_may_gate_card(host->card) && - !host->clk_requests) - schedule_work(&host->clk_gate_work); - spin_unlock_irqrestore(&host->clk_lock, flags); -} - -/** - * mmc_host_clk_rate - get current clock frequency setting - * @host: host to get the clock frequency for. - * - * Returns current clock frequency regardless of gating. - */ -unsigned int mmc_host_clk_rate(struct mmc_host *host) -{ - unsigned long freq; - unsigned long flags; - - spin_lock_irqsave(&host->clk_lock, flags); - if (host->clk_gated) - freq = host->clk_old; - else - freq = host->ios.clock; - spin_unlock_irqrestore(&host->clk_lock, flags); - return freq; -} - -/** - * mmc_host_clk_init - set up clock gating code - * @host: host with potential clock to control - */ -static inline void mmc_host_clk_init(struct mmc_host *host) -{ - host->clk_requests = 0; - /* Hold MCI clock for 8 cycles by default */ - host->clk_delay = 8; - host->clk_gated = false; - INIT_WORK(&host->clk_gate_work, mmc_host_clk_gate_work); - spin_lock_init(&host->clk_lock); - mutex_init(&host->clk_gate_mutex); -} - -/** - * mmc_host_clk_exit - shut down clock gating code - * @host: host with potential clock to control - */ -static inline void mmc_host_clk_exit(struct mmc_host *host) -{ - /* - * Wait for any outstanding gate and then make sure we're - * ungated before exiting. - */ - if (cancel_work_sync(&host->clk_gate_work)) - mmc_host_clk_gate_delayed(host); - if (host->clk_gated) - mmc_host_clk_ungate(host); - /* There should be only one user now */ - WARN_ON(host->clk_requests > 1); -} - -#else - -static inline void mmc_host_clk_init(struct mmc_host *host) -{ -} - -static inline void mmc_host_clk_exit(struct mmc_host *host) -{ -} - -#endif - /** * mmc_alloc_host - initialise the per-host structure. 
* @extra: sizeof private data structure @@ -283,8 +82,6 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->class_dev.class = &mmc_host_class; device_initialize(&host->class_dev); - mmc_host_clk_init(host); - spin_lock_init(&host->lock); init_waitqueue_head(&host->wq); INIT_DELAYED_WORK(&host->detect, mmc_rescan); @@ -366,8 +163,6 @@ void mmc_remove_host(struct mmc_host *host) device_del(&host->class_dev); led_trigger_unregister_simple(host->led); - - mmc_host_clk_exit(host); } EXPORT_SYMBOL(mmc_remove_host); @@ -388,3 +183,4 @@ void mmc_free_host(struct mmc_host *host) } EXPORT_SYMBOL(mmc_free_host); + diff --git a/trunk/drivers/mmc/core/host.h b/trunk/drivers/mmc/core/host.h index de199f911928..8c87e1109a34 100644 --- a/trunk/drivers/mmc/core/host.h +++ b/trunk/drivers/mmc/core/host.h @@ -10,31 +10,10 @@ */ #ifndef _MMC_CORE_HOST_H #define _MMC_CORE_HOST_H -#include int mmc_register_host_class(void); void mmc_unregister_host_class(void); -#ifdef CONFIG_MMC_CLKGATE -void mmc_host_clk_ungate(struct mmc_host *host); -void mmc_host_clk_gate(struct mmc_host *host); -unsigned int mmc_host_clk_rate(struct mmc_host *host); - -#else -static inline void mmc_host_clk_ungate(struct mmc_host *host) -{ -} - -static inline void mmc_host_clk_gate(struct mmc_host *host) -{ -} - -static inline unsigned int mmc_host_clk_rate(struct mmc_host *host) -{ - return host->ios.clock; -} -#endif - void mmc_host_deeper_disable(struct work_struct *work); #endif diff --git a/trunk/drivers/mmc/core/mmc.c b/trunk/drivers/mmc/core/mmc.c index 16006ef153fe..77f93c3b8808 100644 --- a/trunk/drivers/mmc/core/mmc.c +++ b/trunk/drivers/mmc/core/mmc.c @@ -534,57 +534,39 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ if ((card->csd.mmca_vsn >= CSD_SPEC_VER_4) && (host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) { - static unsigned ext_csd_bits[][2] = { - { EXT_CSD_BUS_WIDTH_8, EXT_CSD_DDR_BUS_WIDTH_8 }, - { EXT_CSD_BUS_WIDTH_4, EXT_CSD_DDR_BUS_WIDTH_4 }, - { EXT_CSD_BUS_WIDTH_1, EXT_CSD_BUS_WIDTH_1 }, - }; - static unsigned bus_widths[] = { - MMC_BUS_WIDTH_8, - MMC_BUS_WIDTH_4, - MMC_BUS_WIDTH_1 - }; - unsigned idx, bus_width = 0; - - if (host->caps & MMC_CAP_8_BIT_DATA) - idx = 0; - else - idx = 1; - for (; idx < ARRAY_SIZE(bus_widths); idx++) { - bus_width = bus_widths[idx]; - if (bus_width == MMC_BUS_WIDTH_1) - ddr = 0; /* no DDR for 1-bit width */ - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_BUS_WIDTH, - ext_csd_bits[idx][0]); - if (!err) { - mmc_set_bus_width_ddr(card->host, - bus_width, MMC_SDR_MODE); - /* - * If controller can't handle bus width test, - * use the highest bus width to maintain - * compatibility with previous MMC behavior. 
- */ - if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST)) - break; - err = mmc_bus_test(card, bus_width); - if (!err) - break; - } + unsigned ext_csd_bit, bus_width; + + if (host->caps & MMC_CAP_8_BIT_DATA) { + if (ddr) + ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_8; + else + ext_csd_bit = EXT_CSD_BUS_WIDTH_8; + bus_width = MMC_BUS_WIDTH_8; + } else { + if (ddr) + ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_4; + else + ext_csd_bit = EXT_CSD_BUS_WIDTH_4; + bus_width = MMC_BUS_WIDTH_4; } - if (!err && ddr) { - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_BUS_WIDTH, - ext_csd_bits[idx][1]); - } + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_BUS_WIDTH, ext_csd_bit); + + if (err && err != -EBADMSG) + goto free_card; + if (err) { printk(KERN_WARNING "%s: switch to bus width %d ddr %d " - "failed\n", mmc_hostname(card->host), - 1 << bus_width, ddr); - goto free_card; - } else if (ddr) { - mmc_card_set_ddr_mode(card); + "failed\n", mmc_hostname(card->host), + 1 << bus_width, ddr); + err = 0; + } else { + if (ddr) + mmc_card_set_ddr_mode(card); + else + ddr = MMC_SDR_MODE; + mmc_set_bus_width_ddr(card->host, bus_width, ddr); } } @@ -755,21 +737,14 @@ static void mmc_attach_bus_ops(struct mmc_host *host) /* * Starting point for MMC card init. */ -int mmc_attach_mmc(struct mmc_host *host) +int mmc_attach_mmc(struct mmc_host *host, u32 ocr) { int err; - u32 ocr; BUG_ON(!host); WARN_ON(!host->claimed); - err = mmc_send_op_cond(host, 0, &ocr); - if (err) - return err; - mmc_attach_bus_ops(host); - if (host->ocr_avail_mmc) - host->ocr_avail = host->ocr_avail_mmc; /* * We need to get OCR a different way for SPI. @@ -809,20 +784,20 @@ int mmc_attach_mmc(struct mmc_host *host) goto err; mmc_release_host(host); + err = mmc_add_card(host->card); - mmc_claim_host(host); if (err) goto remove_card; return 0; remove_card: - mmc_release_host(host); mmc_remove_card(host->card); - mmc_claim_host(host); host->card = NULL; + mmc_claim_host(host); err: mmc_detach_bus(host); + mmc_release_host(host); printk(KERN_ERR "%s: error %d whilst initialising MMC card\n", mmc_hostname(host), err); diff --git a/trunk/drivers/mmc/core/mmc_ops.c b/trunk/drivers/mmc/core/mmc_ops.c index 60842f878ded..326447c9ede8 100644 --- a/trunk/drivers/mmc/core/mmc_ops.c +++ b/trunk/drivers/mmc/core/mmc_ops.c @@ -462,104 +462,3 @@ int mmc_send_status(struct mmc_card *card, u32 *status) return 0; } -static int -mmc_send_bus_test(struct mmc_card *card, struct mmc_host *host, u8 opcode, - u8 len) -{ - struct mmc_request mrq; - struct mmc_command cmd; - struct mmc_data data; - struct scatterlist sg; - u8 *data_buf; - u8 *test_buf; - int i, err; - static u8 testdata_8bit[8] = { 0x55, 0xaa, 0, 0, 0, 0, 0, 0 }; - static u8 testdata_4bit[4] = { 0x5a, 0, 0, 0 }; - - /* dma onto stack is unsafe/nonportable, but callers to this - * routine normally provide temporary on-stack buffers ... 
- */ - data_buf = kmalloc(len, GFP_KERNEL); - if (!data_buf) - return -ENOMEM; - - if (len == 8) - test_buf = testdata_8bit; - else if (len == 4) - test_buf = testdata_4bit; - else { - printk(KERN_ERR "%s: Invalid bus_width %d\n", - mmc_hostname(host), len); - kfree(data_buf); - return -EINVAL; - } - - if (opcode == MMC_BUS_TEST_W) - memcpy(data_buf, test_buf, len); - - memset(&mrq, 0, sizeof(struct mmc_request)); - memset(&cmd, 0, sizeof(struct mmc_command)); - memset(&data, 0, sizeof(struct mmc_data)); - - mrq.cmd = &cmd; - mrq.data = &data; - cmd.opcode = opcode; - cmd.arg = 0; - - /* NOTE HACK: the MMC_RSP_SPI_R1 is always correct here, but we - * rely on callers to never use this with "native" calls for reading - * CSD or CID. Native versions of those commands use the R2 type, - * not R1 plus a data block. - */ - cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; - - data.blksz = len; - data.blocks = 1; - if (opcode == MMC_BUS_TEST_R) - data.flags = MMC_DATA_READ; - else - data.flags = MMC_DATA_WRITE; - - data.sg = &sg; - data.sg_len = 1; - sg_init_one(&sg, data_buf, len); - mmc_wait_for_req(host, &mrq); - err = 0; - if (opcode == MMC_BUS_TEST_R) { - for (i = 0; i < len / 4; i++) - if ((test_buf[i] ^ data_buf[i]) != 0xff) { - err = -EIO; - break; - } - } - kfree(data_buf); - - if (cmd.error) - return cmd.error; - if (data.error) - return data.error; - - return err; -} - -int mmc_bus_test(struct mmc_card *card, u8 bus_width) -{ - int err, width; - - if (bus_width == MMC_BUS_WIDTH_8) - width = 8; - else if (bus_width == MMC_BUS_WIDTH_4) - width = 4; - else if (bus_width == MMC_BUS_WIDTH_1) - return 0; /* no need for test */ - else - return -EINVAL; - - /* - * Ignore errors from BUS_TEST_W. BUS_TEST_R will fail if there - * is a problem. This improves chances that the test will work. - */ - mmc_send_bus_test(card, card->host, MMC_BUS_TEST_W, width); - err = mmc_send_bus_test(card, card->host, MMC_BUS_TEST_R, width); - return err; -} diff --git a/trunk/drivers/mmc/core/mmc_ops.h b/trunk/drivers/mmc/core/mmc_ops.h index e6d44b8a18db..653eb8e84178 100644 --- a/trunk/drivers/mmc/core/mmc_ops.h +++ b/trunk/drivers/mmc/core/mmc_ops.h @@ -26,7 +26,6 @@ int mmc_send_cid(struct mmc_host *host, u32 *cid); int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp); int mmc_spi_set_crc(struct mmc_host *host, int use_crc); int mmc_card_sleepawake(struct mmc_host *host, int sleep); -int mmc_bus_test(struct mmc_card *card, u8 bus_width); #endif diff --git a/trunk/drivers/mmc/core/sd.c b/trunk/drivers/mmc/core/sd.c index d18c32bca99b..49da4dffd28e 100644 --- a/trunk/drivers/mmc/core/sd.c +++ b/trunk/drivers/mmc/core/sd.c @@ -764,21 +764,14 @@ static void mmc_sd_attach_bus_ops(struct mmc_host *host) /* * Starting point for SD card init. */ -int mmc_attach_sd(struct mmc_host *host) +int mmc_attach_sd(struct mmc_host *host, u32 ocr) { int err; - u32 ocr; BUG_ON(!host); WARN_ON(!host->claimed); - err = mmc_send_app_op_cond(host, 0, &ocr); - if (err) - return err; - mmc_sd_attach_bus_ops(host); - if (host->ocr_avail_sd) - host->ocr_avail = host->ocr_avail_sd; /* * We need to get OCR a different way for SPI. @@ -802,8 +795,7 @@ int mmc_attach_sd(struct mmc_host *host) ocr &= ~0x7F; } - if ((ocr & MMC_VDD_165_195) && - !(host->ocr_avail_sd & MMC_VDD_165_195)) { + if (ocr & MMC_VDD_165_195) { printk(KERN_WARNING "%s: SD card claims to support the " "incompletely defined 'low voltage range'. 
This " "will be ignored.\n", mmc_hostname(host)); @@ -828,20 +820,20 @@ int mmc_attach_sd(struct mmc_host *host) goto err; mmc_release_host(host); + err = mmc_add_card(host->card); - mmc_claim_host(host); if (err) goto remove_card; return 0; remove_card: - mmc_release_host(host); mmc_remove_card(host->card); host->card = NULL; mmc_claim_host(host); err: mmc_detach_bus(host); + mmc_release_host(host); printk(KERN_ERR "%s: error %d whilst initialising SD card\n", mmc_hostname(host), err); diff --git a/trunk/drivers/mmc/core/sdio.c b/trunk/drivers/mmc/core/sdio.c index 5c4a54d9b6a4..efef5f94ac42 100644 --- a/trunk/drivers/mmc/core/sdio.c +++ b/trunk/drivers/mmc/core/sdio.c @@ -627,27 +627,15 @@ static int mmc_sdio_suspend(struct mmc_host *host) static int mmc_sdio_resume(struct mmc_host *host) { - int i, err = 0; + int i, err; BUG_ON(!host); BUG_ON(!host->card); /* Basic card reinitialization. */ mmc_claim_host(host); - - /* No need to reinitialize powered-resumed nonremovable cards */ - if (mmc_card_is_removable(host) || !mmc_card_is_powered_resumed(host)) - err = mmc_sdio_init_card(host, host->ocr, host->card, + err = mmc_sdio_init_card(host, host->ocr, host->card, (host->pm_flags & MMC_PM_KEEP_POWER)); - else if (mmc_card_is_powered_resumed(host)) { - /* We may have switched to 1-bit mode during suspend */ - err = sdio_enable_4bit_bus(host->card); - if (err > 0) { - mmc_set_bus_width(host, MMC_BUS_WIDTH_4); - err = 0; - } - } - if (!err && host->sdio_irqs) mmc_signal_sdio_irq(host); mmc_release_host(host); @@ -702,22 +690,16 @@ static const struct mmc_bus_ops mmc_sdio_ops = { /* * Starting point for SDIO card init. */ -int mmc_attach_sdio(struct mmc_host *host) +int mmc_attach_sdio(struct mmc_host *host, u32 ocr) { - int err, i, funcs; - u32 ocr; + int err; + int i, funcs; struct mmc_card *card; BUG_ON(!host); WARN_ON(!host->claimed); - err = mmc_send_io_op_cond(host, 0, &ocr); - if (err) - return err; - mmc_attach_bus(host, &mmc_sdio_ops); - if (host->ocr_avail_sdio) - host->ocr_avail = host->ocr_avail_sdio; /* * Sanity check the voltages that the card claims to @@ -787,12 +769,12 @@ int mmc_attach_sdio(struct mmc_host *host) pm_runtime_enable(&card->sdio_func[i]->dev); } + mmc_release_host(host); + /* * First add the card to the driver model... */ - mmc_release_host(host); err = mmc_add_card(host->card); - mmc_claim_host(host); if (err) goto remove_added; @@ -810,17 +792,15 @@ int mmc_attach_sdio(struct mmc_host *host) remove_added: /* Remove without lock if the device has been added. */ - mmc_release_host(host); mmc_sdio_remove(host); mmc_claim_host(host); remove: /* And with lock if it hasn't been added. */ - mmc_release_host(host); if (host->card) mmc_sdio_remove(host); - mmc_claim_host(host); err: mmc_detach_bus(host); + mmc_release_host(host); printk(KERN_ERR "%s: error %d whilst initialising SDIO card\n", mmc_hostname(host), err); diff --git a/trunk/drivers/mmc/core/sdio_bus.c b/trunk/drivers/mmc/core/sdio_bus.c index d29b9c36919a..203da443e339 100644 --- a/trunk/drivers/mmc/core/sdio_bus.c +++ b/trunk/drivers/mmc/core/sdio_bus.c @@ -197,12 +197,44 @@ static int sdio_bus_remove(struct device *dev) #ifdef CONFIG_PM_RUNTIME +static int sdio_bus_pm_prepare(struct device *dev) +{ + struct sdio_func *func = dev_to_sdio_func(dev); + + /* + * Resume an SDIO device which was suspended at run time at this + * point, in order to allow standard SDIO suspend/resume paths + * to keep working as usual. 
+ * + * Ultimately, the SDIO driver itself will decide (in its + * suspend handler, or lack thereof) whether the card should be + * removed or kept, and if kept, at what power state. + * + * At this point, PM core have increased our use count, so it's + * safe to directly resume the device. After system is resumed + * again, PM core will drop back its runtime PM use count, and if + * needed device will be suspended again. + * + * The end result is guaranteed to be a power state that is + * coherent with the device's runtime PM use count. + * + * The return value of pm_runtime_resume is deliberately unchecked + * since there is little point in failing system suspend if a + * device can't be resumed. + */ + if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) + pm_runtime_resume(dev); + + return 0; +} + static const struct dev_pm_ops sdio_bus_pm_ops = { SET_RUNTIME_PM_OPS( pm_generic_runtime_suspend, pm_generic_runtime_resume, pm_generic_runtime_idle ) + .prepare = sdio_bus_pm_prepare, }; #define SDIO_PM_OPS_PTR (&sdio_bus_pm_ops) diff --git a/trunk/drivers/mmc/host/Kconfig b/trunk/drivers/mmc/host/Kconfig index c22a4c039988..e960a9300eb2 100644 --- a/trunk/drivers/mmc/host/Kconfig +++ b/trunk/drivers/mmc/host/Kconfig @@ -142,27 +142,6 @@ config MMC_SDHCI_ESDHC_IMX If unsure, say N. -config MMC_SDHCI_DOVE - bool "SDHCI support on Marvell's Dove SoC" - depends on ARCH_DOVE - depends on MMC_SDHCI_PLTFM - select MMC_SDHCI_IO_ACCESSORS - help - This selects the Secure Digital Host Controller Interface in - Marvell's Dove SoC. - - If unsure, say N. - -config MMC_SDHCI_TEGRA - tristate "SDHCI platform support for the Tegra SD/MMC Controller" - depends on MMC_SDHCI_PLTFM && ARCH_TEGRA - select MMC_SDHCI_IO_ACCESSORS - help - This selects the Tegra SD/MMC controller. If you have a Tegra - platform with SD or MMC devices, say Y or M here. - - If unsure, say N. - config MMC_SDHCI_S3C tristate "SDHCI support on Samsung S3C SoC" depends on MMC_SDHCI && PLAT_SAMSUNG @@ -481,22 +460,6 @@ config SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND help If you say yes here SD-Cards may work on the EZkit. -config MMC_DW - tristate "Synopsys DesignWare Memory Card Interface" - depends on ARM - help - This selects support for the Synopsys DesignWare Mobile Storage IP - block, this provides host support for SD and MMC interfaces, in both - PIO and external DMA modes. - -config MMC_DW_IDMAC - bool "Internal DMAC interface" - depends on MMC_DW - help - This selects support for the internal DMAC block within the Synopsys - Designware Mobile Storage IP block. This disables the external DMA - interface. 
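Earlier in this patch, the sdio_bus.c hunk adds a .prepare callback that calls pm_runtime_resume() so a runtime-suspended SDIO function is powered up again before the system-wide suspend path runs. The fragment below is a generic, illustrative version of that pattern for an arbitrary bus or driver; my_bus_pm_prepare and my_bus_pm_ops are invented names, while pm_runtime_resume(), the pm_generic_runtime_*() helpers and SET_RUNTIME_PM_OPS() are the real interfaces the hunk relies on.

    /*
     * Illustrative only: resume a runtime-suspended device in .prepare so
     * the ordinary system-suspend callbacks always see it powered, the same
     * idea as sdio_bus_pm_prepare() above.
     */
    #include <linux/device.h>
    #include <linux/pm.h>
    #include <linux/pm_runtime.h>

    static int my_bus_pm_prepare(struct device *dev)
    {
            /*
             * The PM core has already taken a runtime PM reference on dev,
             * so resuming it directly is safe; the reference is dropped
             * again after the system resumes.  The return value of
             * pm_runtime_resume() is deliberately ignored: failing system
             * suspend because one device could not be woken helps nobody.
             */
            pm_runtime_resume(dev);
            return 0;
    }

    static const struct dev_pm_ops my_bus_pm_ops = {
            SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend,
                               pm_generic_runtime_resume,
                               pm_generic_runtime_idle)
            .prepare = my_bus_pm_prepare,
    };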
- config MMC_SH_MMCIF tristate "SuperH Internal MMCIF support" depends on MMC_BLOCK && (SUPERH || ARCH_SHMOBILE) diff --git a/trunk/drivers/mmc/host/Makefile b/trunk/drivers/mmc/host/Makefile index e834fb223e9a..7b645ff43b30 100644 --- a/trunk/drivers/mmc/host/Makefile +++ b/trunk/drivers/mmc/host/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o obj-$(CONFIG_MMC_CB710) += cb710-mmc.o obj-$(CONFIG_MMC_VIA_SDMMC) += via-sdmmc.o obj-$(CONFIG_SDH_BFIN) += bfin_sdh.o -obj-$(CONFIG_MMC_DW) += dw_mmc.o obj-$(CONFIG_MMC_SH_MMCIF) += sh_mmcif.o obj-$(CONFIG_MMC_JZ4740) += jz4740_mmc.o obj-$(CONFIG_MMC_USHC) += ushc.o @@ -40,8 +39,6 @@ obj-$(CONFIG_MMC_SDHCI_PLTFM) += sdhci-platform.o sdhci-platform-y := sdhci-pltfm.o sdhci-platform-$(CONFIG_MMC_SDHCI_CNS3XXX) += sdhci-cns3xxx.o sdhci-platform-$(CONFIG_MMC_SDHCI_ESDHC_IMX) += sdhci-esdhc-imx.o -sdhci-platform-$(CONFIG_MMC_SDHCI_DOVE) += sdhci-dove.o -sdhci-platform-$(CONFIG_MMC_SDHCI_TEGRA) += sdhci-tegra.o obj-$(CONFIG_MMC_SDHCI_OF) += sdhci-of.o sdhci-of-y := sdhci-of-core.o diff --git a/trunk/drivers/mmc/host/davinci_mmc.c b/trunk/drivers/mmc/host/davinci_mmc.c index 0076c7448fe6..e15547cf701f 100644 --- a/trunk/drivers/mmc/host/davinci_mmc.c +++ b/trunk/drivers/mmc/host/davinci_mmc.c @@ -66,8 +66,8 @@ #define DAVINCI_MMCBLNC 0x60 #define DAVINCI_SDIOCTL 0x64 #define DAVINCI_SDIOST0 0x68 -#define DAVINCI_SDIOIEN 0x6C -#define DAVINCI_SDIOIST 0x70 +#define DAVINCI_SDIOEN 0x6C +#define DAVINCI_SDIOST 0x70 #define DAVINCI_MMCFIFOCTL 0x74 /* FIFO Control Register */ /* DAVINCI_MMCCTL definitions */ @@ -131,14 +131,6 @@ #define MMCFIFOCTL_ACCWD_2 (2 << 3) /* access width of 2 bytes */ #define MMCFIFOCTL_ACCWD_1 (3 << 3) /* access width of 1 byte */ -/* DAVINCI_SDIOST0 definitions */ -#define SDIOST0_DAT1_HI BIT(0) - -/* DAVINCI_SDIOIEN definitions */ -#define SDIOIEN_IOINTEN BIT(0) - -/* DAVINCI_SDIOIST definitions */ -#define SDIOIST_IOINT BIT(0) /* MMCSD Init clock in Hz in opendrain mode */ #define MMCSD_INIT_CLOCK 200000 @@ -172,7 +164,7 @@ struct mmc_davinci_host { unsigned int mmc_input_clk; void __iomem *base; struct resource *mem_res; - int mmc_irq, sdio_irq; + int irq; unsigned char bus_mode; #define DAVINCI_MMC_DATADIR_NONE 0 @@ -192,7 +184,6 @@ struct mmc_davinci_host { u32 rxdma, txdma; bool use_dma; bool do_dma; - bool sdio_int; /* Scatterlist DMA uses one or more parameter RAM entries: * the main one (associated with rxdma or txdma) plus zero or @@ -489,7 +480,7 @@ static void mmc_davinci_send_dma_request(struct mmc_davinci_host *host, struct scatterlist *sg; unsigned sg_len; unsigned bytes_left = host->bytes_left; - const unsigned shift = ffs(rw_threshold) - 1; + const unsigned shift = ffs(rw_threshold) - 1;; if (host->data_dir == DAVINCI_MMC_DATADIR_WRITE) { template = &host->tx_template; @@ -875,19 +866,6 @@ mmc_davinci_xfer_done(struct mmc_davinci_host *host, struct mmc_data *data) { host->data = NULL; - if (host->mmc->caps & MMC_CAP_SDIO_IRQ) { - /* - * SDIO Interrupt Detection work-around as suggested by - * Davinci Errata (TMS320DM355 Silicon Revision 1.1 Errata - * 2.1.6): Signal SDIO interrupt only if it is enabled by core - */ - if (host->sdio_int && !(readl(host->base + DAVINCI_SDIOST0) & - SDIOST0_DAT1_HI)) { - writel(SDIOIST_IOINT, host->base + DAVINCI_SDIOIST); - mmc_signal_sdio_irq(host->mmc); - } - } - if (host->do_dma) { davinci_abort_dma(host); @@ -954,21 +932,6 @@ davinci_abort_data(struct mmc_davinci_host *host, struct mmc_data *data) mmc_davinci_reset_ctrl(host, 0); } -static irqreturn_t 
mmc_davinci_sdio_irq(int irq, void *dev_id) -{ - struct mmc_davinci_host *host = dev_id; - unsigned int status; - - status = readl(host->base + DAVINCI_SDIOIST); - if (status & SDIOIST_IOINT) { - dev_dbg(mmc_dev(host->mmc), - "SDIO interrupt status %x\n", status); - writel(status | SDIOIST_IOINT, host->base + DAVINCI_SDIOIST); - mmc_signal_sdio_irq(host->mmc); - } - return IRQ_HANDLED; -} - static irqreturn_t mmc_davinci_irq(int irq, void *dev_id) { struct mmc_davinci_host *host = (struct mmc_davinci_host *)dev_id; @@ -1113,32 +1076,11 @@ static int mmc_davinci_get_ro(struct mmc_host *mmc) return config->get_ro(pdev->id); } -static void mmc_davinci_enable_sdio_irq(struct mmc_host *mmc, int enable) -{ - struct mmc_davinci_host *host = mmc_priv(mmc); - - if (enable) { - if (!(readl(host->base + DAVINCI_SDIOST0) & SDIOST0_DAT1_HI)) { - writel(SDIOIST_IOINT, host->base + DAVINCI_SDIOIST); - mmc_signal_sdio_irq(host->mmc); - } else { - host->sdio_int = true; - writel(readl(host->base + DAVINCI_SDIOIEN) | - SDIOIEN_IOINTEN, host->base + DAVINCI_SDIOIEN); - } - } else { - host->sdio_int = false; - writel(readl(host->base + DAVINCI_SDIOIEN) & ~SDIOIEN_IOINTEN, - host->base + DAVINCI_SDIOIEN); - } -} - static struct mmc_host_ops mmc_davinci_ops = { .request = mmc_davinci_request, .set_ios = mmc_davinci_set_ios, .get_cd = mmc_davinci_get_cd, .get_ro = mmc_davinci_get_ro, - .enable_sdio_irq = mmc_davinci_enable_sdio_irq, }; /*----------------------------------------------------------------------*/ @@ -1267,8 +1209,7 @@ static int __init davinci_mmcsd_probe(struct platform_device *pdev) host->nr_sg = MAX_NR_SG; host->use_dma = use_dma; - host->mmc_irq = irq; - host->sdio_irq = platform_get_irq(pdev, 1); + host->irq = irq; if (host->use_dma && davinci_acquire_dma_channels(host) != 0) host->use_dma = 0; @@ -1329,13 +1270,6 @@ static int __init davinci_mmcsd_probe(struct platform_device *pdev) if (ret) goto out; - if (host->sdio_irq >= 0) { - ret = request_irq(host->sdio_irq, mmc_davinci_sdio_irq, 0, - mmc_hostname(mmc), host); - if (!ret) - mmc->caps |= MMC_CAP_SDIO_IRQ; - } - rename_region(mem, mmc_hostname(mmc)); dev_info(mmc_dev(host->mmc), "Using %s, %d-bit mode\n", @@ -1379,9 +1313,7 @@ static int __exit davinci_mmcsd_remove(struct platform_device *pdev) mmc_davinci_cpufreq_deregister(host); mmc_remove_host(host->mmc); - free_irq(host->mmc_irq, host); - if (host->mmc->caps & MMC_CAP_SDIO_IRQ) - free_irq(host->sdio_irq, host); + free_irq(host->irq, host); davinci_release_dma_channels(host); diff --git a/trunk/drivers/mmc/host/dw_mmc.c b/trunk/drivers/mmc/host/dw_mmc.c deleted file mode 100644 index 2fcc82577c1b..000000000000 --- a/trunk/drivers/mmc/host/dw_mmc.c +++ /dev/null @@ -1,1796 +0,0 @@ -/* - * Synopsys DesignWare Multimedia Card Interface driver - * (Based on NXP driver for lpc 31xx) - * - * Copyright (C) 2009 NXP Semiconductors - * Copyright (C) 2009, 2010 Imagination Technologies Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dw_mmc.h" - -/* Common flag combinations */ -#define DW_MCI_DATA_ERROR_FLAGS (SDMMC_INT_DTO | SDMMC_INT_DCRC | \ - SDMMC_INT_HTO | SDMMC_INT_SBE | \ - SDMMC_INT_EBE) -#define DW_MCI_CMD_ERROR_FLAGS (SDMMC_INT_RTO | SDMMC_INT_RCRC | \ - SDMMC_INT_RESP_ERR) -#define DW_MCI_ERROR_FLAGS (DW_MCI_DATA_ERROR_FLAGS | \ - DW_MCI_CMD_ERROR_FLAGS | SDMMC_INT_HLE) -#define DW_MCI_SEND_STATUS 1 -#define DW_MCI_RECV_STATUS 2 -#define DW_MCI_DMA_THRESHOLD 16 - -#ifdef CONFIG_MMC_DW_IDMAC -struct idmac_desc { - u32 des0; /* Control Descriptor */ -#define IDMAC_DES0_DIC BIT(1) -#define IDMAC_DES0_LD BIT(2) -#define IDMAC_DES0_FD BIT(3) -#define IDMAC_DES0_CH BIT(4) -#define IDMAC_DES0_ER BIT(5) -#define IDMAC_DES0_CES BIT(30) -#define IDMAC_DES0_OWN BIT(31) - - u32 des1; /* Buffer sizes */ -#define IDMAC_SET_BUFFER1_SIZE(d, s) \ - ((d)->des1 = ((d)->des1 & 0x03ffc000) | ((s) & 0x3fff)) - - u32 des2; /* buffer 1 physical address */ - - u32 des3; /* buffer 2 physical address */ -}; -#endif /* CONFIG_MMC_DW_IDMAC */ - -/** - * struct dw_mci_slot - MMC slot state - * @mmc: The mmc_host representing this slot. - * @host: The MMC controller this slot is using. - * @ctype: Card type for this slot. - * @mrq: mmc_request currently being processed or waiting to be - * processed, or NULL when the slot is idle. - * @queue_node: List node for placing this node in the @queue list of - * &struct dw_mci. - * @clock: Clock rate configured by set_ios(). Protected by host->lock. - * @flags: Random state bits associated with the slot. - * @id: Number of this slot. - * @last_detect_state: Most recently observed card detect state. 
- */ -struct dw_mci_slot { - struct mmc_host *mmc; - struct dw_mci *host; - - u32 ctype; - - struct mmc_request *mrq; - struct list_head queue_node; - - unsigned int clock; - unsigned long flags; -#define DW_MMC_CARD_PRESENT 0 -#define DW_MMC_CARD_NEED_INIT 1 - int id; - int last_detect_state; -}; - -#if defined(CONFIG_DEBUG_FS) -static int dw_mci_req_show(struct seq_file *s, void *v) -{ - struct dw_mci_slot *slot = s->private; - struct mmc_request *mrq; - struct mmc_command *cmd; - struct mmc_command *stop; - struct mmc_data *data; - - /* Make sure we get a consistent snapshot */ - spin_lock_bh(&slot->host->lock); - mrq = slot->mrq; - - if (mrq) { - cmd = mrq->cmd; - data = mrq->data; - stop = mrq->stop; - - if (cmd) - seq_printf(s, - "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n", - cmd->opcode, cmd->arg, cmd->flags, - cmd->resp[0], cmd->resp[1], cmd->resp[2], - cmd->resp[2], cmd->error); - if (data) - seq_printf(s, "DATA %u / %u * %u flg %x err %d\n", - data->bytes_xfered, data->blocks, - data->blksz, data->flags, data->error); - if (stop) - seq_printf(s, - "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n", - stop->opcode, stop->arg, stop->flags, - stop->resp[0], stop->resp[1], stop->resp[2], - stop->resp[2], stop->error); - } - - spin_unlock_bh(&slot->host->lock); - - return 0; -} - -static int dw_mci_req_open(struct inode *inode, struct file *file) -{ - return single_open(file, dw_mci_req_show, inode->i_private); -} - -static const struct file_operations dw_mci_req_fops = { - .owner = THIS_MODULE, - .open = dw_mci_req_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int dw_mci_regs_show(struct seq_file *s, void *v) -{ - seq_printf(s, "STATUS:\t0x%08x\n", SDMMC_STATUS); - seq_printf(s, "RINTSTS:\t0x%08x\n", SDMMC_RINTSTS); - seq_printf(s, "CMD:\t0x%08x\n", SDMMC_CMD); - seq_printf(s, "CTRL:\t0x%08x\n", SDMMC_CTRL); - seq_printf(s, "INTMASK:\t0x%08x\n", SDMMC_INTMASK); - seq_printf(s, "CLKENA:\t0x%08x\n", SDMMC_CLKENA); - - return 0; -} - -static int dw_mci_regs_open(struct inode *inode, struct file *file) -{ - return single_open(file, dw_mci_regs_show, inode->i_private); -} - -static const struct file_operations dw_mci_regs_fops = { - .owner = THIS_MODULE, - .open = dw_mci_regs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void dw_mci_init_debugfs(struct dw_mci_slot *slot) -{ - struct mmc_host *mmc = slot->mmc; - struct dw_mci *host = slot->host; - struct dentry *root; - struct dentry *node; - - root = mmc->debugfs_root; - if (!root) - return; - - node = debugfs_create_file("regs", S_IRUSR, root, host, - &dw_mci_regs_fops); - if (!node) - goto err; - - node = debugfs_create_file("req", S_IRUSR, root, slot, - &dw_mci_req_fops); - if (!node) - goto err; - - node = debugfs_create_u32("state", S_IRUSR, root, (u32 *)&host->state); - if (!node) - goto err; - - node = debugfs_create_x32("pending_events", S_IRUSR, root, - (u32 *)&host->pending_events); - if (!node) - goto err; - - node = debugfs_create_x32("completed_events", S_IRUSR, root, - (u32 *)&host->completed_events); - if (!node) - goto err; - - return; - -err: - dev_err(&mmc->class_dev, "failed to initialize debugfs for slot\n"); -} -#endif /* defined(CONFIG_DEBUG_FS) */ - -static void dw_mci_set_timeout(struct dw_mci *host) -{ - /* timeout (maximum) */ - mci_writel(host, TMOUT, 0xffffffff); -} - -static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) -{ - struct mmc_data *data; - u32 cmdr; - cmd->error = -EINPROGRESS; - - 
cmdr = cmd->opcode; - - if (cmdr == MMC_STOP_TRANSMISSION) - cmdr |= SDMMC_CMD_STOP; - else - cmdr |= SDMMC_CMD_PRV_DAT_WAIT; - - if (cmd->flags & MMC_RSP_PRESENT) { - /* We expect a response, so set this bit */ - cmdr |= SDMMC_CMD_RESP_EXP; - if (cmd->flags & MMC_RSP_136) - cmdr |= SDMMC_CMD_RESP_LONG; - } - - if (cmd->flags & MMC_RSP_CRC) - cmdr |= SDMMC_CMD_RESP_CRC; - - data = cmd->data; - if (data) { - cmdr |= SDMMC_CMD_DAT_EXP; - if (data->flags & MMC_DATA_STREAM) - cmdr |= SDMMC_CMD_STRM_MODE; - if (data->flags & MMC_DATA_WRITE) - cmdr |= SDMMC_CMD_DAT_WR; - } - - return cmdr; -} - -static void dw_mci_start_command(struct dw_mci *host, - struct mmc_command *cmd, u32 cmd_flags) -{ - host->cmd = cmd; - dev_vdbg(&host->pdev->dev, - "start command: ARGR=0x%08x CMDR=0x%08x\n", - cmd->arg, cmd_flags); - - mci_writel(host, CMDARG, cmd->arg); - wmb(); - - mci_writel(host, CMD, cmd_flags | SDMMC_CMD_START); -} - -static void send_stop_cmd(struct dw_mci *host, struct mmc_data *data) -{ - dw_mci_start_command(host, data->stop, host->stop_cmdr); -} - -/* DMA interface functions */ -static void dw_mci_stop_dma(struct dw_mci *host) -{ - if (host->use_dma) { - host->dma_ops->stop(host); - host->dma_ops->cleanup(host); - } else { - /* Data transfer was stopped by the interrupt handler */ - set_bit(EVENT_XFER_COMPLETE, &host->pending_events); - } -} - -#ifdef CONFIG_MMC_DW_IDMAC -static void dw_mci_dma_cleanup(struct dw_mci *host) -{ - struct mmc_data *data = host->data; - - if (data) - dma_unmap_sg(&host->pdev->dev, data->sg, data->sg_len, - ((data->flags & MMC_DATA_WRITE) - ? DMA_TO_DEVICE : DMA_FROM_DEVICE)); -} - -static void dw_mci_idmac_stop_dma(struct dw_mci *host) -{ - u32 temp; - - /* Disable and reset the IDMAC interface */ - temp = mci_readl(host, CTRL); - temp &= ~SDMMC_CTRL_USE_IDMAC; - temp |= SDMMC_CTRL_DMA_RESET; - mci_writel(host, CTRL, temp); - - /* Stop the IDMAC running */ - temp = mci_readl(host, BMOD); - temp &= ~SDMMC_IDMAC_ENABLE; - mci_writel(host, BMOD, temp); -} - -static void dw_mci_idmac_complete_dma(struct dw_mci *host) -{ - struct mmc_data *data = host->data; - - dev_vdbg(&host->pdev->dev, "DMA complete\n"); - - host->dma_ops->cleanup(host); - - /* - * If the card was removed, data will be NULL. No point in trying to - * send the stop command or waiting for NBUSY in this case. 
- */ - if (data) { - set_bit(EVENT_XFER_COMPLETE, &host->pending_events); - tasklet_schedule(&host->tasklet); - } -} - -static void dw_mci_translate_sglist(struct dw_mci *host, struct mmc_data *data, - unsigned int sg_len) -{ - int i; - struct idmac_desc *desc = host->sg_cpu; - - for (i = 0; i < sg_len; i++, desc++) { - unsigned int length = sg_dma_len(&data->sg[i]); - u32 mem_addr = sg_dma_address(&data->sg[i]); - - /* Set the OWN bit and disable interrupts for this descriptor */ - desc->des0 = IDMAC_DES0_OWN | IDMAC_DES0_DIC | IDMAC_DES0_CH; - - /* Buffer length */ - IDMAC_SET_BUFFER1_SIZE(desc, length); - - /* Physical address to DMA to/from */ - desc->des2 = mem_addr; - } - - /* Set first descriptor */ - desc = host->sg_cpu; - desc->des0 |= IDMAC_DES0_FD; - - /* Set last descriptor */ - desc = host->sg_cpu + (i - 1) * sizeof(struct idmac_desc); - desc->des0 &= ~(IDMAC_DES0_CH | IDMAC_DES0_DIC); - desc->des0 |= IDMAC_DES0_LD; - - wmb(); -} - -static void dw_mci_idmac_start_dma(struct dw_mci *host, unsigned int sg_len) -{ - u32 temp; - - dw_mci_translate_sglist(host, host->data, sg_len); - - /* Select IDMAC interface */ - temp = mci_readl(host, CTRL); - temp |= SDMMC_CTRL_USE_IDMAC; - mci_writel(host, CTRL, temp); - - wmb(); - - /* Enable the IDMAC */ - temp = mci_readl(host, BMOD); - temp |= SDMMC_IDMAC_ENABLE; - mci_writel(host, BMOD, temp); - - /* Start it running */ - mci_writel(host, PLDMND, 1); -} - -static int dw_mci_idmac_init(struct dw_mci *host) -{ - struct idmac_desc *p; - int i; - - /* Number of descriptors in the ring buffer */ - host->ring_size = PAGE_SIZE / sizeof(struct idmac_desc); - - /* Forward link the descriptor list */ - for (i = 0, p = host->sg_cpu; i < host->ring_size - 1; i++, p++) - p->des3 = host->sg_dma + (sizeof(struct idmac_desc) * (i + 1)); - - /* Set the last descriptor as the end-of-ring descriptor */ - p->des3 = host->sg_dma; - p->des0 = IDMAC_DES0_ER; - - /* Mask out interrupts - get Tx & Rx complete only */ - mci_writel(host, IDINTEN, SDMMC_IDMAC_INT_NI | SDMMC_IDMAC_INT_RI | - SDMMC_IDMAC_INT_TI); - - /* Set the descriptor base address */ - mci_writel(host, DBADDR, host->sg_dma); - return 0; -} - -static struct dw_mci_dma_ops dw_mci_idmac_ops = { - .init = dw_mci_idmac_init, - .start = dw_mci_idmac_start_dma, - .stop = dw_mci_idmac_stop_dma, - .complete = dw_mci_idmac_complete_dma, - .cleanup = dw_mci_dma_cleanup, -}; -#endif /* CONFIG_MMC_DW_IDMAC */ - -static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) -{ - struct scatterlist *sg; - unsigned int i, direction, sg_len; - u32 temp; - - /* If we don't have a channel, we can't do DMA */ - if (!host->use_dma) - return -ENODEV; - - /* - * We don't do DMA on "complex" transfers, i.e. with - * non-word-aligned buffers or lengths. Also, we don't bother - * with all the DMA setup overhead for short transfers. 
- */ - if (data->blocks * data->blksz < DW_MCI_DMA_THRESHOLD) - return -EINVAL; - if (data->blksz & 3) - return -EINVAL; - - for_each_sg(data->sg, sg, data->sg_len, i) { - if (sg->offset & 3 || sg->length & 3) - return -EINVAL; - } - - if (data->flags & MMC_DATA_READ) - direction = DMA_FROM_DEVICE; - else - direction = DMA_TO_DEVICE; - - sg_len = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, - direction); - - dev_vdbg(&host->pdev->dev, - "sd sg_cpu: %#lx sg_dma: %#lx sg_len: %d\n", - (unsigned long)host->sg_cpu, (unsigned long)host->sg_dma, - sg_len); - - /* Enable the DMA interface */ - temp = mci_readl(host, CTRL); - temp |= SDMMC_CTRL_DMA_ENABLE; - mci_writel(host, CTRL, temp); - - /* Disable RX/TX IRQs, let DMA handle it */ - temp = mci_readl(host, INTMASK); - temp &= ~(SDMMC_INT_RXDR | SDMMC_INT_TXDR); - mci_writel(host, INTMASK, temp); - - host->dma_ops->start(host, sg_len); - - return 0; -} - -static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) -{ - u32 temp; - - data->error = -EINPROGRESS; - - WARN_ON(host->data); - host->sg = NULL; - host->data = data; - - if (dw_mci_submit_data_dma(host, data)) { - host->sg = data->sg; - host->pio_offset = 0; - if (data->flags & MMC_DATA_READ) - host->dir_status = DW_MCI_RECV_STATUS; - else - host->dir_status = DW_MCI_SEND_STATUS; - - temp = mci_readl(host, INTMASK); - temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR; - mci_writel(host, INTMASK, temp); - - temp = mci_readl(host, CTRL); - temp &= ~SDMMC_CTRL_DMA_ENABLE; - mci_writel(host, CTRL, temp); - } -} - -static void mci_send_cmd(struct dw_mci_slot *slot, u32 cmd, u32 arg) -{ - struct dw_mci *host = slot->host; - unsigned long timeout = jiffies + msecs_to_jiffies(500); - unsigned int cmd_status = 0; - - mci_writel(host, CMDARG, arg); - wmb(); - mci_writel(host, CMD, SDMMC_CMD_START | cmd); - - while (time_before(jiffies, timeout)) { - cmd_status = mci_readl(host, CMD); - if (!(cmd_status & SDMMC_CMD_START)) - return; - } - dev_err(&slot->mmc->class_dev, - "Timeout sending command (cmd %#x arg %#x status %#x)\n", - cmd, arg, cmd_status); -} - -static void dw_mci_setup_bus(struct dw_mci_slot *slot) -{ - struct dw_mci *host = slot->host; - u32 div; - - if (slot->clock != host->current_speed) { - if (host->bus_hz % slot->clock) - /* - * move the + 1 after the divide to prevent - * over-clocking the card. - */ - div = ((host->bus_hz / slot->clock) >> 1) + 1; - else - div = (host->bus_hz / slot->clock) >> 1; - - dev_info(&slot->mmc->class_dev, - "Bus speed (slot %d) = %dHz (slot req %dHz, actual %dHZ" - " div = %d)\n", slot->id, host->bus_hz, slot->clock, - div ? 
((host->bus_hz / div) >> 1) : host->bus_hz, div); - - /* disable clock */ - mci_writel(host, CLKENA, 0); - mci_writel(host, CLKSRC, 0); - - /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); - - /* set clock to desired speed */ - mci_writel(host, CLKDIV, div); - - /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); - - /* enable clock */ - mci_writel(host, CLKENA, SDMMC_CLKEN_ENABLE); - - /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); - - host->current_speed = slot->clock; - } - - /* Set the current slot bus width */ - mci_writel(host, CTYPE, slot->ctype); -} - -static void dw_mci_start_request(struct dw_mci *host, - struct dw_mci_slot *slot) -{ - struct mmc_request *mrq; - struct mmc_command *cmd; - struct mmc_data *data; - u32 cmdflags; - - mrq = slot->mrq; - if (host->pdata->select_slot) - host->pdata->select_slot(slot->id); - - /* Slot specific timing and width adjustment */ - dw_mci_setup_bus(slot); - - host->cur_slot = slot; - host->mrq = mrq; - - host->pending_events = 0; - host->completed_events = 0; - host->data_status = 0; - - data = mrq->data; - if (data) { - dw_mci_set_timeout(host); - mci_writel(host, BYTCNT, data->blksz*data->blocks); - mci_writel(host, BLKSIZ, data->blksz); - } - - cmd = mrq->cmd; - cmdflags = dw_mci_prepare_command(slot->mmc, cmd); - - /* this is the first command, send the initialization clock */ - if (test_and_clear_bit(DW_MMC_CARD_NEED_INIT, &slot->flags)) - cmdflags |= SDMMC_CMD_INIT; - - if (data) { - dw_mci_submit_data(host, data); - wmb(); - } - - dw_mci_start_command(host, cmd, cmdflags); - - if (mrq->stop) - host->stop_cmdr = dw_mci_prepare_command(slot->mmc, mrq->stop); -} - -static void dw_mci_queue_request(struct dw_mci *host, struct dw_mci_slot *slot, - struct mmc_request *mrq) -{ - dev_vdbg(&slot->mmc->class_dev, "queue request: state=%d\n", - host->state); - - spin_lock_bh(&host->lock); - slot->mrq = mrq; - - if (host->state == STATE_IDLE) { - host->state = STATE_SENDING_CMD; - dw_mci_start_request(host, slot); - } else { - list_add_tail(&slot->queue_node, &host->queue); - } - - spin_unlock_bh(&host->lock); -} - -static void dw_mci_request(struct mmc_host *mmc, struct mmc_request *mrq) -{ - struct dw_mci_slot *slot = mmc_priv(mmc); - struct dw_mci *host = slot->host; - - WARN_ON(slot->mrq); - - if (!test_bit(DW_MMC_CARD_PRESENT, &slot->flags)) { - mrq->cmd->error = -ENOMEDIUM; - mmc_request_done(mmc, mrq); - return; - } - - /* We don't support multiple blocks of weird lengths. */ - dw_mci_queue_request(host, slot, mrq); -} - -static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) -{ - struct dw_mci_slot *slot = mmc_priv(mmc); - - /* set default 1 bit mode */ - slot->ctype = SDMMC_CTYPE_1BIT; - - switch (ios->bus_width) { - case MMC_BUS_WIDTH_1: - slot->ctype = SDMMC_CTYPE_1BIT; - break; - case MMC_BUS_WIDTH_4: - slot->ctype = SDMMC_CTYPE_4BIT; - break; - } - - if (ios->clock) { - /* - * Use mirror of ios->clock to prevent race with mmc - * core ios update when finding the minimum. 
- */ - slot->clock = ios->clock; - } - - switch (ios->power_mode) { - case MMC_POWER_UP: - set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags); - break; - default: - break; - } -} - -static int dw_mci_get_ro(struct mmc_host *mmc) -{ - int read_only; - struct dw_mci_slot *slot = mmc_priv(mmc); - struct dw_mci_board *brd = slot->host->pdata; - - /* Use platform get_ro function, else try on board write protect */ - if (brd->get_ro) - read_only = brd->get_ro(slot->id); - else - read_only = - mci_readl(slot->host, WRTPRT) & (1 << slot->id) ? 1 : 0; - - dev_dbg(&mmc->class_dev, "card is %s\n", - read_only ? "read-only" : "read-write"); - - return read_only; -} - -static int dw_mci_get_cd(struct mmc_host *mmc) -{ - int present; - struct dw_mci_slot *slot = mmc_priv(mmc); - struct dw_mci_board *brd = slot->host->pdata; - - /* Use platform get_cd function, else try onboard card detect */ - if (brd->get_cd) - present = !brd->get_cd(slot->id); - else - present = (mci_readl(slot->host, CDETECT) & (1 << slot->id)) - == 0 ? 1 : 0; - - if (present) - dev_dbg(&mmc->class_dev, "card is present\n"); - else - dev_dbg(&mmc->class_dev, "card is not present\n"); - - return present; -} - -static const struct mmc_host_ops dw_mci_ops = { - .request = dw_mci_request, - .set_ios = dw_mci_set_ios, - .get_ro = dw_mci_get_ro, - .get_cd = dw_mci_get_cd, -}; - -static void dw_mci_request_end(struct dw_mci *host, struct mmc_request *mrq) - __releases(&host->lock) - __acquires(&host->lock) -{ - struct dw_mci_slot *slot; - struct mmc_host *prev_mmc = host->cur_slot->mmc; - - WARN_ON(host->cmd || host->data); - - host->cur_slot->mrq = NULL; - host->mrq = NULL; - if (!list_empty(&host->queue)) { - slot = list_entry(host->queue.next, - struct dw_mci_slot, queue_node); - list_del(&slot->queue_node); - dev_vdbg(&host->pdev->dev, "list not empty: %s is next\n", - mmc_hostname(slot->mmc)); - host->state = STATE_SENDING_CMD; - dw_mci_start_request(host, slot); - } else { - dev_vdbg(&host->pdev->dev, "list empty\n"); - host->state = STATE_IDLE; - } - - spin_unlock(&host->lock); - mmc_request_done(prev_mmc, mrq); - spin_lock(&host->lock); -} - -static void dw_mci_command_complete(struct dw_mci *host, struct mmc_command *cmd) -{ - u32 status = host->cmd_status; - - host->cmd_status = 0; - - /* Read the response from the card (up to 16 bytes) */ - if (cmd->flags & MMC_RSP_PRESENT) { - if (cmd->flags & MMC_RSP_136) { - cmd->resp[3] = mci_readl(host, RESP0); - cmd->resp[2] = mci_readl(host, RESP1); - cmd->resp[1] = mci_readl(host, RESP2); - cmd->resp[0] = mci_readl(host, RESP3); - } else { - cmd->resp[0] = mci_readl(host, RESP0); - cmd->resp[1] = 0; - cmd->resp[2] = 0; - cmd->resp[3] = 0; - } - } - - if (status & SDMMC_INT_RTO) - cmd->error = -ETIMEDOUT; - else if ((cmd->flags & MMC_RSP_CRC) && (status & SDMMC_INT_RCRC)) - cmd->error = -EILSEQ; - else if (status & SDMMC_INT_RESP_ERR) - cmd->error = -EIO; - else - cmd->error = 0; - - if (cmd->error) { - /* newer ip versions need a delay between retries */ - if (host->quirks & DW_MCI_QUIRK_RETRY_DELAY) - mdelay(20); - - if (cmd->data) { - host->data = NULL; - dw_mci_stop_dma(host); - } - } -} - -static void dw_mci_tasklet_func(unsigned long priv) -{ - struct dw_mci *host = (struct dw_mci *)priv; - struct mmc_data *data; - struct mmc_command *cmd; - enum dw_mci_state state; - enum dw_mci_state prev_state; - u32 status; - - spin_lock(&host->lock); - - state = host->state; - data = host->data; - - do { - prev_state = state; - - switch (state) { - case STATE_IDLE: - break; - - case STATE_SENDING_CMD: 
- if (!test_and_clear_bit(EVENT_CMD_COMPLETE, - &host->pending_events)) - break; - - cmd = host->cmd; - host->cmd = NULL; - set_bit(EVENT_CMD_COMPLETE, &host->completed_events); - dw_mci_command_complete(host, host->mrq->cmd); - if (!host->mrq->data || cmd->error) { - dw_mci_request_end(host, host->mrq); - goto unlock; - } - - prev_state = state = STATE_SENDING_DATA; - /* fall through */ - - case STATE_SENDING_DATA: - if (test_and_clear_bit(EVENT_DATA_ERROR, - &host->pending_events)) { - dw_mci_stop_dma(host); - if (data->stop) - send_stop_cmd(host, data); - state = STATE_DATA_ERROR; - break; - } - - if (!test_and_clear_bit(EVENT_XFER_COMPLETE, - &host->pending_events)) - break; - - set_bit(EVENT_XFER_COMPLETE, &host->completed_events); - prev_state = state = STATE_DATA_BUSY; - /* fall through */ - - case STATE_DATA_BUSY: - if (!test_and_clear_bit(EVENT_DATA_COMPLETE, - &host->pending_events)) - break; - - host->data = NULL; - set_bit(EVENT_DATA_COMPLETE, &host->completed_events); - status = host->data_status; - - if (status & DW_MCI_DATA_ERROR_FLAGS) { - if (status & SDMMC_INT_DTO) { - dev_err(&host->pdev->dev, - "data timeout error\n"); - data->error = -ETIMEDOUT; - } else if (status & SDMMC_INT_DCRC) { - dev_err(&host->pdev->dev, - "data CRC error\n"); - data->error = -EILSEQ; - } else { - dev_err(&host->pdev->dev, - "data FIFO error " - "(status=%08x)\n", - status); - data->error = -EIO; - } - } else { - data->bytes_xfered = data->blocks * data->blksz; - data->error = 0; - } - - if (!data->stop) { - dw_mci_request_end(host, host->mrq); - goto unlock; - } - - prev_state = state = STATE_SENDING_STOP; - if (!data->error) - send_stop_cmd(host, data); - /* fall through */ - - case STATE_SENDING_STOP: - if (!test_and_clear_bit(EVENT_CMD_COMPLETE, - &host->pending_events)) - break; - - host->cmd = NULL; - dw_mci_command_complete(host, host->mrq->stop); - dw_mci_request_end(host, host->mrq); - goto unlock; - - case STATE_DATA_ERROR: - if (!test_and_clear_bit(EVENT_XFER_COMPLETE, - &host->pending_events)) - break; - - state = STATE_DATA_BUSY; - break; - } - } while (state != prev_state); - - host->state = state; -unlock: - spin_unlock(&host->lock); - -} - -static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt) -{ - u16 *pdata = (u16 *)buf; - - WARN_ON(cnt % 2 != 0); - - cnt = cnt >> 1; - while (cnt > 0) { - mci_writew(host, DATA, *pdata++); - cnt--; - } -} - -static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt) -{ - u16 *pdata = (u16 *)buf; - - WARN_ON(cnt % 2 != 0); - - cnt = cnt >> 1; - while (cnt > 0) { - *pdata++ = mci_readw(host, DATA); - cnt--; - } -} - -static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt) -{ - u32 *pdata = (u32 *)buf; - - WARN_ON(cnt % 4 != 0); - WARN_ON((unsigned long)pdata & 0x3); - - cnt = cnt >> 2; - while (cnt > 0) { - mci_writel(host, DATA, *pdata++); - cnt--; - } -} - -static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt) -{ - u32 *pdata = (u32 *)buf; - - WARN_ON(cnt % 4 != 0); - WARN_ON((unsigned long)pdata & 0x3); - - cnt = cnt >> 2; - while (cnt > 0) { - *pdata++ = mci_readl(host, DATA); - cnt--; - } -} - -static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt) -{ - u64 *pdata = (u64 *)buf; - - WARN_ON(cnt % 8 != 0); - - cnt = cnt >> 3; - while (cnt > 0) { - mci_writeq(host, DATA, *pdata++); - cnt--; - } -} - -static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt) -{ - u64 *pdata = (u64 *)buf; - - WARN_ON(cnt % 8 != 0); - - cnt = cnt >> 3; - while (cnt > 0) 
{ - *pdata++ = mci_readq(host, DATA); - cnt--; - } -} - -static void dw_mci_read_data_pio(struct dw_mci *host) -{ - struct scatterlist *sg = host->sg; - void *buf = sg_virt(sg); - unsigned int offset = host->pio_offset; - struct mmc_data *data = host->data; - int shift = host->data_shift; - u32 status; - unsigned int nbytes = 0, len, old_len, count = 0; - - do { - len = SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift; - if (count == 0) - old_len = len; - - if (offset + len <= sg->length) { - host->pull_data(host, (void *)(buf + offset), len); - - offset += len; - nbytes += len; - - if (offset == sg->length) { - flush_dcache_page(sg_page(sg)); - host->sg = sg = sg_next(sg); - if (!sg) - goto done; - - offset = 0; - buf = sg_virt(sg); - } - } else { - unsigned int remaining = sg->length - offset; - host->pull_data(host, (void *)(buf + offset), - remaining); - nbytes += remaining; - - flush_dcache_page(sg_page(sg)); - host->sg = sg = sg_next(sg); - if (!sg) - goto done; - - offset = len - remaining; - buf = sg_virt(sg); - host->pull_data(host, buf, offset); - nbytes += offset; - } - - status = mci_readl(host, MINTSTS); - mci_writel(host, RINTSTS, SDMMC_INT_RXDR); - if (status & DW_MCI_DATA_ERROR_FLAGS) { - host->data_status = status; - data->bytes_xfered += nbytes; - smp_wmb(); - - set_bit(EVENT_DATA_ERROR, &host->pending_events); - - tasklet_schedule(&host->tasklet); - return; - } - count++; - } while (status & SDMMC_INT_RXDR); /*if the RXDR is ready read again*/ - len = SDMMC_GET_FCNT(mci_readl(host, STATUS)); - host->pio_offset = offset; - data->bytes_xfered += nbytes; - return; - -done: - data->bytes_xfered += nbytes; - smp_wmb(); - set_bit(EVENT_XFER_COMPLETE, &host->pending_events); -} - -static void dw_mci_write_data_pio(struct dw_mci *host) -{ - struct scatterlist *sg = host->sg; - void *buf = sg_virt(sg); - unsigned int offset = host->pio_offset; - struct mmc_data *data = host->data; - int shift = host->data_shift; - u32 status; - unsigned int nbytes = 0, len; - - do { - len = SDMMC_FIFO_SZ - - (SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift); - if (offset + len <= sg->length) { - host->push_data(host, (void *)(buf + offset), len); - - offset += len; - nbytes += len; - if (offset == sg->length) { - host->sg = sg = sg_next(sg); - if (!sg) - goto done; - - offset = 0; - buf = sg_virt(sg); - } - } else { - unsigned int remaining = sg->length - offset; - - host->push_data(host, (void *)(buf + offset), - remaining); - nbytes += remaining; - - host->sg = sg = sg_next(sg); - if (!sg) - goto done; - - offset = len - remaining; - buf = sg_virt(sg); - host->push_data(host, (void *)buf, offset); - nbytes += offset; - } - - status = mci_readl(host, MINTSTS); - mci_writel(host, RINTSTS, SDMMC_INT_TXDR); - if (status & DW_MCI_DATA_ERROR_FLAGS) { - host->data_status = status; - data->bytes_xfered += nbytes; - - smp_wmb(); - - set_bit(EVENT_DATA_ERROR, &host->pending_events); - - tasklet_schedule(&host->tasklet); - return; - } - } while (status & SDMMC_INT_TXDR); /* if TXDR write again */ - - host->pio_offset = offset; - data->bytes_xfered += nbytes; - - return; - -done: - data->bytes_xfered += nbytes; - smp_wmb(); - set_bit(EVENT_XFER_COMPLETE, &host->pending_events); -} - -static void dw_mci_cmd_interrupt(struct dw_mci *host, u32 status) -{ - if (!host->cmd_status) - host->cmd_status = status; - - smp_wmb(); - - set_bit(EVENT_CMD_COMPLETE, &host->pending_events); - tasklet_schedule(&host->tasklet); -} - -static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) -{ - struct dw_mci *host = 
dev_id; - u32 status, pending; - unsigned int pass_count = 0; - - do { - status = mci_readl(host, RINTSTS); - pending = mci_readl(host, MINTSTS); /* read-only mask reg */ - - /* - * DTO fix - version 2.10a and below, and only if internal DMA - * is configured. - */ - if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) { - if (!pending && - ((mci_readl(host, STATUS) >> 17) & 0x1fff)) - pending |= SDMMC_INT_DATA_OVER; - } - - if (!pending) - break; - - if (pending & DW_MCI_CMD_ERROR_FLAGS) { - mci_writel(host, RINTSTS, DW_MCI_CMD_ERROR_FLAGS); - host->cmd_status = status; - smp_wmb(); - set_bit(EVENT_CMD_COMPLETE, &host->pending_events); - tasklet_schedule(&host->tasklet); - } - - if (pending & DW_MCI_DATA_ERROR_FLAGS) { - /* if there is an error report DATA_ERROR */ - mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS); - host->data_status = status; - smp_wmb(); - set_bit(EVENT_DATA_ERROR, &host->pending_events); - tasklet_schedule(&host->tasklet); - } - - if (pending & SDMMC_INT_DATA_OVER) { - mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER); - if (!host->data_status) - host->data_status = status; - smp_wmb(); - if (host->dir_status == DW_MCI_RECV_STATUS) { - if (host->sg != NULL) - dw_mci_read_data_pio(host); - } - set_bit(EVENT_DATA_COMPLETE, &host->pending_events); - tasklet_schedule(&host->tasklet); - } - - if (pending & SDMMC_INT_RXDR) { - mci_writel(host, RINTSTS, SDMMC_INT_RXDR); - if (host->sg) - dw_mci_read_data_pio(host); - } - - if (pending & SDMMC_INT_TXDR) { - mci_writel(host, RINTSTS, SDMMC_INT_TXDR); - if (host->sg) - dw_mci_write_data_pio(host); - } - - if (pending & SDMMC_INT_CMD_DONE) { - mci_writel(host, RINTSTS, SDMMC_INT_CMD_DONE); - dw_mci_cmd_interrupt(host, status); - } - - if (pending & SDMMC_INT_CD) { - mci_writel(host, RINTSTS, SDMMC_INT_CD); - tasklet_schedule(&host->card_tasklet); - } - - } while (pass_count++ < 5); - -#ifdef CONFIG_MMC_DW_IDMAC - /* Handle DMA interrupts */ - pending = mci_readl(host, IDSTS); - if (pending & (SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI)) { - mci_writel(host, IDSTS, SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI); - mci_writel(host, IDSTS, SDMMC_IDMAC_INT_NI); - set_bit(EVENT_DATA_COMPLETE, &host->pending_events); - host->dma_ops->complete(host); - } -#endif - - return IRQ_HANDLED; -} - -static void dw_mci_tasklet_card(unsigned long data) -{ - struct dw_mci *host = (struct dw_mci *)data; - int i; - - for (i = 0; i < host->num_slots; i++) { - struct dw_mci_slot *slot = host->slot[i]; - struct mmc_host *mmc = slot->mmc; - struct mmc_request *mrq; - int present; - u32 ctrl; - - present = dw_mci_get_cd(mmc); - while (present != slot->last_detect_state) { - spin_lock(&host->lock); - - dev_dbg(&slot->mmc->class_dev, "card %s\n", - present ? 
"inserted" : "removed"); - - /* Card change detected */ - slot->last_detect_state = present; - - /* Power up slot */ - if (present != 0) { - if (host->pdata->setpower) - host->pdata->setpower(slot->id, - mmc->ocr_avail); - - set_bit(DW_MMC_CARD_PRESENT, &slot->flags); - } - - /* Clean up queue if present */ - mrq = slot->mrq; - if (mrq) { - if (mrq == host->mrq) { - host->data = NULL; - host->cmd = NULL; - - switch (host->state) { - case STATE_IDLE: - break; - case STATE_SENDING_CMD: - mrq->cmd->error = -ENOMEDIUM; - if (!mrq->data) - break; - /* fall through */ - case STATE_SENDING_DATA: - mrq->data->error = -ENOMEDIUM; - dw_mci_stop_dma(host); - break; - case STATE_DATA_BUSY: - case STATE_DATA_ERROR: - if (mrq->data->error == -EINPROGRESS) - mrq->data->error = -ENOMEDIUM; - if (!mrq->stop) - break; - /* fall through */ - case STATE_SENDING_STOP: - mrq->stop->error = -ENOMEDIUM; - break; - } - - dw_mci_request_end(host, mrq); - } else { - list_del(&slot->queue_node); - mrq->cmd->error = -ENOMEDIUM; - if (mrq->data) - mrq->data->error = -ENOMEDIUM; - if (mrq->stop) - mrq->stop->error = -ENOMEDIUM; - - spin_unlock(&host->lock); - mmc_request_done(slot->mmc, mrq); - spin_lock(&host->lock); - } - } - - /* Power down slot */ - if (present == 0) { - if (host->pdata->setpower) - host->pdata->setpower(slot->id, 0); - clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); - - /* - * Clear down the FIFO - doing so generates a - * block interrupt, hence setting the - * scatter-gather pointer to NULL. - */ - host->sg = NULL; - - ctrl = mci_readl(host, CTRL); - ctrl |= SDMMC_CTRL_FIFO_RESET; - mci_writel(host, CTRL, ctrl); - -#ifdef CONFIG_MMC_DW_IDMAC - ctrl = mci_readl(host, BMOD); - ctrl |= 0x01; /* Software reset of DMA */ - mci_writel(host, BMOD, ctrl); -#endif - - } - - spin_unlock(&host->lock); - present = dw_mci_get_cd(mmc); - } - - mmc_detect_change(slot->mmc, - msecs_to_jiffies(host->pdata->detect_delay_ms)); - } -} - -static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id) -{ - struct mmc_host *mmc; - struct dw_mci_slot *slot; - - mmc = mmc_alloc_host(sizeof(struct dw_mci_slot), &host->pdev->dev); - if (!mmc) - return -ENOMEM; - - slot = mmc_priv(mmc); - slot->id = id; - slot->mmc = mmc; - slot->host = host; - - mmc->ops = &dw_mci_ops; - mmc->f_min = DIV_ROUND_UP(host->bus_hz, 510); - mmc->f_max = host->bus_hz; - - if (host->pdata->get_ocr) - mmc->ocr_avail = host->pdata->get_ocr(id); - else - mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - - /* - * Start with slot power disabled, it will be enabled when a card - * is detected. - */ - if (host->pdata->setpower) - host->pdata->setpower(id, 0); - - mmc->caps = 0; - if (host->pdata->get_bus_wd) - if (host->pdata->get_bus_wd(slot->id) >= 4) - mmc->caps |= MMC_CAP_4_BIT_DATA; - - if (host->pdata->quirks & DW_MCI_QUIRK_HIGHSPEED) - mmc->caps |= MMC_CAP_SD_HIGHSPEED; - -#ifdef CONFIG_MMC_DW_IDMAC - mmc->max_segs = host->ring_size; - mmc->max_blk_size = 65536; - mmc->max_blk_count = host->ring_size; - mmc->max_seg_size = 0x1000; - mmc->max_req_size = mmc->max_seg_size * mmc->max_blk_count; -#else - if (host->pdata->blk_settings) { - mmc->max_segs = host->pdata->blk_settings->max_segs; - mmc->max_blk_size = host->pdata->blk_settings->max_blk_size; - mmc->max_blk_count = host->pdata->blk_settings->max_blk_count; - mmc->max_req_size = host->pdata->blk_settings->max_req_size; - mmc->max_seg_size = host->pdata->blk_settings->max_seg_size; - } else { - /* Useful defaults if platform data is unset. 
*/ - mmc->max_segs = 64; - mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */ - mmc->max_blk_count = 512; - mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; - mmc->max_seg_size = mmc->max_req_size; - } -#endif /* CONFIG_MMC_DW_IDMAC */ - - if (dw_mci_get_cd(mmc)) - set_bit(DW_MMC_CARD_PRESENT, &slot->flags); - else - clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); - - host->slot[id] = slot; - mmc_add_host(mmc); - -#if defined(CONFIG_DEBUG_FS) - dw_mci_init_debugfs(slot); -#endif - - /* Card initially undetected */ - slot->last_detect_state = 0; - - return 0; -} - -static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id) -{ - /* Shutdown detect IRQ */ - if (slot->host->pdata->exit) - slot->host->pdata->exit(id); - - /* Debugfs stuff is cleaned up by mmc core */ - mmc_remove_host(slot->mmc); - slot->host->slot[id] = NULL; - mmc_free_host(slot->mmc); -} - -static void dw_mci_init_dma(struct dw_mci *host) -{ - /* Alloc memory for sg translation */ - host->sg_cpu = dma_alloc_coherent(&host->pdev->dev, PAGE_SIZE, - &host->sg_dma, GFP_KERNEL); - if (!host->sg_cpu) { - dev_err(&host->pdev->dev, "%s: could not alloc DMA memory\n", - __func__); - goto no_dma; - } - - /* Determine which DMA interface to use */ -#ifdef CONFIG_MMC_DW_IDMAC - host->dma_ops = &dw_mci_idmac_ops; - dev_info(&host->pdev->dev, "Using internal DMA controller.\n"); -#endif - - if (!host->dma_ops) - goto no_dma; - - if (host->dma_ops->init) { - if (host->dma_ops->init(host)) { - dev_err(&host->pdev->dev, "%s: Unable to initialize " - "DMA Controller.\n", __func__); - goto no_dma; - } - } else { - dev_err(&host->pdev->dev, "DMA initialization not found.\n"); - goto no_dma; - } - - host->use_dma = 1; - return; - -no_dma: - dev_info(&host->pdev->dev, "Using PIO mode.\n"); - host->use_dma = 0; - return; -} - -static bool mci_wait_reset(struct device *dev, struct dw_mci *host) -{ - unsigned long timeout = jiffies + msecs_to_jiffies(500); - unsigned int ctrl; - - mci_writel(host, CTRL, (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | - SDMMC_CTRL_DMA_RESET)); - - /* wait till resets clear */ - do { - ctrl = mci_readl(host, CTRL); - if (!(ctrl & (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | - SDMMC_CTRL_DMA_RESET))) - return true; - } while (time_before(jiffies, timeout)); - - dev_err(dev, "Timeout resetting block (ctrl %#x)\n", ctrl); - - return false; -} - -static int dw_mci_probe(struct platform_device *pdev) -{ - struct dw_mci *host; - struct resource *regs; - struct dw_mci_board *pdata; - int irq, ret, i, width; - u32 fifo_size; - - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!regs) - return -ENXIO; - - irq = platform_get_irq(pdev, 0); - if (irq < 0) - return irq; - - host = kzalloc(sizeof(struct dw_mci), GFP_KERNEL); - if (!host) - return -ENOMEM; - - host->pdev = pdev; - host->pdata = pdata = pdev->dev.platform_data; - if (!pdata || !pdata->init) { - dev_err(&pdev->dev, - "Platform data must supply init function\n"); - ret = -ENODEV; - goto err_freehost; - } - - if (!pdata->select_slot && pdata->num_slots > 1) { - dev_err(&pdev->dev, - "Platform data must supply select_slot function\n"); - ret = -ENODEV; - goto err_freehost; - } - - if (!pdata->bus_hz) { - dev_err(&pdev->dev, - "Platform data must supply bus speed\n"); - ret = -ENODEV; - goto err_freehost; - } - - host->bus_hz = pdata->bus_hz; - host->quirks = pdata->quirks; - - spin_lock_init(&host->lock); - INIT_LIST_HEAD(&host->queue); - - ret = -ENOMEM; - host->regs = ioremap(regs->start, regs->end - regs->start + 1); - if (!host->regs) 
- goto err_freehost; - - host->dma_ops = pdata->dma_ops; - dw_mci_init_dma(host); - - /* - * Get the host data width - this assumes that HCON has been set with - * the correct values. - */ - i = (mci_readl(host, HCON) >> 7) & 0x7; - if (!i) { - host->push_data = dw_mci_push_data16; - host->pull_data = dw_mci_pull_data16; - width = 16; - host->data_shift = 1; - } else if (i == 2) { - host->push_data = dw_mci_push_data64; - host->pull_data = dw_mci_pull_data64; - width = 64; - host->data_shift = 3; - } else { - /* Check for a reserved value, and warn if it is */ - WARN((i != 1), - "HCON reports a reserved host data width!\n" - "Defaulting to 32-bit access.\n"); - host->push_data = dw_mci_push_data32; - host->pull_data = dw_mci_pull_data32; - width = 32; - host->data_shift = 2; - } - - /* Reset all blocks */ - if (!mci_wait_reset(&pdev->dev, host)) { - ret = -ENODEV; - goto err_dmaunmap; - } - - /* Clear the interrupts for the host controller */ - mci_writel(host, RINTSTS, 0xFFFFFFFF); - mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */ - - /* Put in max timeout */ - mci_writel(host, TMOUT, 0xFFFFFFFF); - - /* - * FIFO threshold settings RxMark = fifo_size / 2 - 1, - * Tx Mark = fifo_size / 2 DMA Size = 8 - */ - fifo_size = mci_readl(host, FIFOTH); - fifo_size = (fifo_size >> 16) & 0x7ff; - mci_writel(host, FIFOTH, ((0x2 << 28) | ((fifo_size/2 - 1) << 16) | - ((fifo_size/2) << 0))); - - /* disable clock to CIU */ - mci_writel(host, CLKENA, 0); - mci_writel(host, CLKSRC, 0); - - tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host); - tasklet_init(&host->card_tasklet, - dw_mci_tasklet_card, (unsigned long)host); - - ret = request_irq(irq, dw_mci_interrupt, 0, "dw-mci", host); - if (ret) - goto err_dmaunmap; - - platform_set_drvdata(pdev, host); - - if (host->pdata->num_slots) - host->num_slots = host->pdata->num_slots; - else - host->num_slots = ((mci_readl(host, HCON) >> 1) & 0x1F) + 1; - - /* We need at least one slot to succeed */ - for (i = 0; i < host->num_slots; i++) { - ret = dw_mci_init_slot(host, i); - if (ret) { - ret = -ENODEV; - goto err_init_slot; - } - } - - /* - * Enable interrupts for command done, data over, data empty, card det, - * receive ready and error such as transmit, receive timeout, crc error - */ - mci_writel(host, RINTSTS, 0xFFFFFFFF); - mci_writel(host, INTMASK, SDMMC_INT_CMD_DONE | SDMMC_INT_DATA_OVER | - SDMMC_INT_TXDR | SDMMC_INT_RXDR | - DW_MCI_ERROR_FLAGS | SDMMC_INT_CD); - mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */ - - dev_info(&pdev->dev, "DW MMC controller at irq %d, " - "%d bit host data width\n", irq, width); - if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) - dev_info(&pdev->dev, "Internal DMAC interrupt fix enabled.\n"); - - return 0; - -err_init_slot: - /* De-init any initialized slots */ - while (i > 0) { - if (host->slot[i]) - dw_mci_cleanup_slot(host->slot[i], i); - i--; - } - free_irq(irq, host); - -err_dmaunmap: - if (host->use_dma && host->dma_ops->exit) - host->dma_ops->exit(host); - dma_free_coherent(&host->pdev->dev, PAGE_SIZE, - host->sg_cpu, host->sg_dma); - iounmap(host->regs); - -err_freehost: - kfree(host); - return ret; -} - -static int __exit dw_mci_remove(struct platform_device *pdev) -{ - struct dw_mci *host = platform_get_drvdata(pdev); - int i; - - mci_writel(host, RINTSTS, 0xFFFFFFFF); - mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */ - - platform_set_drvdata(pdev, NULL); - - for (i = 0; i < host->num_slots; i++) { - dev_dbg(&pdev->dev, "remove slot 
%d\n", i); - if (host->slot[i]) - dw_mci_cleanup_slot(host->slot[i], i); - } - - /* disable clock to CIU */ - mci_writel(host, CLKENA, 0); - mci_writel(host, CLKSRC, 0); - - free_irq(platform_get_irq(pdev, 0), host); - dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma); - - if (host->use_dma && host->dma_ops->exit) - host->dma_ops->exit(host); - - iounmap(host->regs); - - kfree(host); - return 0; -} - -#ifdef CONFIG_PM -/* - * TODO: we should probably disable the clock to the card in the suspend path. - */ -static int dw_mci_suspend(struct platform_device *pdev, pm_message_t mesg) -{ - int i, ret; - struct dw_mci *host = platform_get_drvdata(pdev); - - for (i = 0; i < host->num_slots; i++) { - struct dw_mci_slot *slot = host->slot[i]; - if (!slot) - continue; - ret = mmc_suspend_host(slot->mmc); - if (ret < 0) { - while (--i >= 0) { - slot = host->slot[i]; - if (slot) - mmc_resume_host(host->slot[i]->mmc); - } - return ret; - } - } - - return 0; -} - -static int dw_mci_resume(struct platform_device *pdev) -{ - int i, ret; - struct dw_mci *host = platform_get_drvdata(pdev); - - for (i = 0; i < host->num_slots; i++) { - struct dw_mci_slot *slot = host->slot[i]; - if (!slot) - continue; - ret = mmc_resume_host(host->slot[i]->mmc); - if (ret < 0) - return ret; - } - - return 0; -} -#else -#define dw_mci_suspend NULL -#define dw_mci_resume NULL -#endif /* CONFIG_PM */ - -static struct platform_driver dw_mci_driver = { - .remove = __exit_p(dw_mci_remove), - .suspend = dw_mci_suspend, - .resume = dw_mci_resume, - .driver = { - .name = "dw_mmc", - }, -}; - -static int __init dw_mci_init(void) -{ - return platform_driver_probe(&dw_mci_driver, dw_mci_probe); -} - -static void __exit dw_mci_exit(void) -{ - platform_driver_unregister(&dw_mci_driver); -} - -module_init(dw_mci_init); -module_exit(dw_mci_exit); - -MODULE_DESCRIPTION("DW Multimedia Card Interface driver"); -MODULE_AUTHOR("NXP Semiconductor VietNam"); -MODULE_AUTHOR("Imagination Technologies Ltd"); -MODULE_LICENSE("GPL v2"); diff --git a/trunk/drivers/mmc/host/dw_mmc.h b/trunk/drivers/mmc/host/dw_mmc.h deleted file mode 100644 index 5dd55a75233d..000000000000 --- a/trunk/drivers/mmc/host/dw_mmc.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Synopsys DesignWare Multimedia Card Interface driver - * (Based on NXP driver for lpc 31xx) - * - * Copyright (C) 2009 NXP Semiconductors - * Copyright (C) 2009, 2010 Imagination Technologies Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#ifndef _DW_MMC_H_ -#define _DW_MMC_H_ - -#define SDMMC_CTRL 0x000 -#define SDMMC_PWREN 0x004 -#define SDMMC_CLKDIV 0x008 -#define SDMMC_CLKSRC 0x00c -#define SDMMC_CLKENA 0x010 -#define SDMMC_TMOUT 0x014 -#define SDMMC_CTYPE 0x018 -#define SDMMC_BLKSIZ 0x01c -#define SDMMC_BYTCNT 0x020 -#define SDMMC_INTMASK 0x024 -#define SDMMC_CMDARG 0x028 -#define SDMMC_CMD 0x02c -#define SDMMC_RESP0 0x030 -#define SDMMC_RESP1 0x034 -#define SDMMC_RESP2 0x038 -#define SDMMC_RESP3 0x03c -#define SDMMC_MINTSTS 0x040 -#define SDMMC_RINTSTS 0x044 -#define SDMMC_STATUS 0x048 -#define SDMMC_FIFOTH 0x04c -#define SDMMC_CDETECT 0x050 -#define SDMMC_WRTPRT 0x054 -#define SDMMC_GPIO 0x058 -#define SDMMC_TCBCNT 0x05c -#define SDMMC_TBBCNT 0x060 -#define SDMMC_DEBNCE 0x064 -#define SDMMC_USRID 0x068 -#define SDMMC_VERID 0x06c -#define SDMMC_HCON 0x070 -#define SDMMC_BMOD 0x080 -#define SDMMC_PLDMND 0x084 -#define SDMMC_DBADDR 0x088 -#define SDMMC_IDSTS 0x08c -#define SDMMC_IDINTEN 0x090 -#define SDMMC_DSCADDR 0x094 -#define SDMMC_BUFADDR 0x098 -#define SDMMC_DATA 0x100 -#define SDMMC_DATA_ADR 0x100 - -/* shift bit field */ -#define _SBF(f, v) ((v) << (f)) - -/* Control register defines */ -#define SDMMC_CTRL_USE_IDMAC BIT(25) -#define SDMMC_CTRL_CEATA_INT_EN BIT(11) -#define SDMMC_CTRL_SEND_AS_CCSD BIT(10) -#define SDMMC_CTRL_SEND_CCSD BIT(9) -#define SDMMC_CTRL_ABRT_READ_DATA BIT(8) -#define SDMMC_CTRL_SEND_IRQ_RESP BIT(7) -#define SDMMC_CTRL_READ_WAIT BIT(6) -#define SDMMC_CTRL_DMA_ENABLE BIT(5) -#define SDMMC_CTRL_INT_ENABLE BIT(4) -#define SDMMC_CTRL_DMA_RESET BIT(2) -#define SDMMC_CTRL_FIFO_RESET BIT(1) -#define SDMMC_CTRL_RESET BIT(0) -/* Clock Enable register defines */ -#define SDMMC_CLKEN_LOW_PWR BIT(16) -#define SDMMC_CLKEN_ENABLE BIT(0) -/* time-out register defines */ -#define SDMMC_TMOUT_DATA(n) _SBF(8, (n)) -#define SDMMC_TMOUT_DATA_MSK 0xFFFFFF00 -#define SDMMC_TMOUT_RESP(n) ((n) & 0xFF) -#define SDMMC_TMOUT_RESP_MSK 0xFF -/* card-type register defines */ -#define SDMMC_CTYPE_8BIT BIT(16) -#define SDMMC_CTYPE_4BIT BIT(0) -#define SDMMC_CTYPE_1BIT 0 -/* Interrupt status & mask register defines */ -#define SDMMC_INT_SDIO BIT(16) -#define SDMMC_INT_EBE BIT(15) -#define SDMMC_INT_ACD BIT(14) -#define SDMMC_INT_SBE BIT(13) -#define SDMMC_INT_HLE BIT(12) -#define SDMMC_INT_FRUN BIT(11) -#define SDMMC_INT_HTO BIT(10) -#define SDMMC_INT_DTO BIT(9) -#define SDMMC_INT_RTO BIT(8) -#define SDMMC_INT_DCRC BIT(7) -#define SDMMC_INT_RCRC BIT(6) -#define SDMMC_INT_RXDR BIT(5) -#define SDMMC_INT_TXDR BIT(4) -#define SDMMC_INT_DATA_OVER BIT(3) -#define SDMMC_INT_CMD_DONE BIT(2) -#define SDMMC_INT_RESP_ERR BIT(1) -#define SDMMC_INT_CD BIT(0) -#define SDMMC_INT_ERROR 0xbfc2 -/* Command register defines */ -#define SDMMC_CMD_START BIT(31) -#define SDMMC_CMD_CCS_EXP BIT(23) -#define SDMMC_CMD_CEATA_RD BIT(22) -#define SDMMC_CMD_UPD_CLK BIT(21) -#define SDMMC_CMD_INIT BIT(15) -#define SDMMC_CMD_STOP BIT(14) -#define SDMMC_CMD_PRV_DAT_WAIT BIT(13) -#define SDMMC_CMD_SEND_STOP BIT(12) -#define SDMMC_CMD_STRM_MODE BIT(11) -#define SDMMC_CMD_DAT_WR BIT(10) -#define SDMMC_CMD_DAT_EXP BIT(9) -#define SDMMC_CMD_RESP_CRC BIT(8) -#define SDMMC_CMD_RESP_LONG BIT(7) -#define SDMMC_CMD_RESP_EXP BIT(6) -#define SDMMC_CMD_INDX(n) ((n) & 0x1F) -/* Status register defines */ -#define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FF) -#define SDMMC_FIFO_SZ 32 -/* Internal DMAC interrupt defines */ -#define SDMMC_IDMAC_INT_AI BIT(9) -#define SDMMC_IDMAC_INT_NI BIT(8) -#define SDMMC_IDMAC_INT_CES BIT(5) -#define SDMMC_IDMAC_INT_DU BIT(4) 
-#define SDMMC_IDMAC_INT_FBE BIT(2) -#define SDMMC_IDMAC_INT_RI BIT(1) -#define SDMMC_IDMAC_INT_TI BIT(0) -/* Internal DMAC bus mode bits */ -#define SDMMC_IDMAC_ENABLE BIT(7) -#define SDMMC_IDMAC_FB BIT(1) -#define SDMMC_IDMAC_SWRESET BIT(0) - -/* Register access macros */ -#define mci_readl(dev, reg) \ - __raw_readl(dev->regs + SDMMC_##reg) -#define mci_writel(dev, reg, value) \ - __raw_writel((value), dev->regs + SDMMC_##reg) - -/* 16-bit FIFO access macros */ -#define mci_readw(dev, reg) \ - __raw_readw(dev->regs + SDMMC_##reg) -#define mci_writew(dev, reg, value) \ - __raw_writew((value), dev->regs + SDMMC_##reg) - -/* 64-bit FIFO access macros */ -#ifdef readq -#define mci_readq(dev, reg) \ - __raw_readq(dev->regs + SDMMC_##reg) -#define mci_writeq(dev, reg, value) \ - __raw_writeq((value), dev->regs + SDMMC_##reg) -#else -/* - * Dummy readq implementation for architectures that don't define it. - * - * We would assume that none of these architectures would configure - * the IP block with a 64bit FIFO width, so this code will never be - * executed on those machines. Defining these macros here keeps the - * rest of the code free from ifdefs. - */ -#define mci_readq(dev, reg) \ - (*(volatile u64 __force *)(dev->regs + SDMMC_##reg)) -#define mci_writeq(dev, reg, value) \ - (*(volatile u64 __force *)(dev->regs + SDMMC_##reg) = value) -#endif - -#endif /* _DW_MMC_H_ */ diff --git a/trunk/drivers/mmc/host/mxcmmc.c b/trunk/drivers/mmc/host/mxcmmc.c index 4428594261c5..bdd2cbb87cba 100644 --- a/trunk/drivers/mmc/host/mxcmmc.c +++ b/trunk/drivers/mmc/host/mxcmmc.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -142,49 +141,10 @@ struct mxcmci_host { struct work_struct datawork; spinlock_t lock; - - struct regulator *vcc; }; static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios); -static inline void mxcmci_init_ocr(struct mxcmci_host *host) -{ - host->vcc = regulator_get(mmc_dev(host->mmc), "vmmc"); - - if (IS_ERR(host->vcc)) { - host->vcc = NULL; - } else { - host->mmc->ocr_avail = mmc_regulator_get_ocrmask(host->vcc); - if (host->pdata && host->pdata->ocr_avail) - dev_warn(mmc_dev(host->mmc), - "pdata->ocr_avail will not be used\n"); - } - - if (host->vcc == NULL) { - /* fall-back to platform data */ - if (host->pdata && host->pdata->ocr_avail) - host->mmc->ocr_avail = host->pdata->ocr_avail; - else - host->mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - } -} - -static inline void mxcmci_set_power(struct mxcmci_host *host, - unsigned char power_mode, - unsigned int vdd) -{ - if (host->vcc) { - if (power_mode == MMC_POWER_UP) - mmc_regulator_set_ocr(host->mmc, host->vcc, vdd); - else if (power_mode == MMC_POWER_OFF) - mmc_regulator_set_ocr(host->mmc, host->vcc, 0); - } - - if (host->pdata && host->pdata->setpower) - host->pdata->setpower(mmc_dev(host->mmc), vdd); -} - static inline int mxcmci_use_dma(struct mxcmci_host *host) { return host->do_dma; @@ -720,9 +680,9 @@ static void mxcmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->cmdat &= ~CMD_DAT_CONT_BUS_WIDTH_4; if (host->power_mode != ios->power_mode) { - mxcmci_set_power(host, ios->power_mode, ios->vdd); + if (host->pdata && host->pdata->setpower) + host->pdata->setpower(mmc_dev(mmc), ios->vdd); host->power_mode = ios->power_mode; - if (ios->power_mode == MMC_POWER_ON) host->cmdat |= CMD_DAT_CONT_INIT; } @@ -847,7 +807,10 @@ static int mxcmci_probe(struct platform_device *pdev) host->pdata = pdev->dev.platform_data; spin_lock_init(&host->lock); - mxcmci_init_ocr(host); 
+ if (host->pdata && host->pdata->ocr_avail) + mmc->ocr_avail = host->pdata->ocr_avail; + else + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; if (host->pdata && host->pdata->dat3_card_detect) host->default_irq_mask = @@ -952,9 +915,6 @@ static int mxcmci_remove(struct platform_device *pdev) mmc_remove_host(mmc); - if (host->vcc) - regulator_put(host->vcc); - if (host->pdata && host->pdata->exit) host->pdata->exit(&pdev->dev, mmc); @@ -967,6 +927,7 @@ static int mxcmci_remove(struct platform_device *pdev) clk_put(host->clk); release_mem_region(host->res->start, resource_size(host->res)); + release_resource(host->res); mmc_free_host(mmc); diff --git a/trunk/drivers/mmc/host/sdhci-dove.c b/trunk/drivers/mmc/host/sdhci-dove.c deleted file mode 100644 index 2aeef4ffed8c..000000000000 --- a/trunk/drivers/mmc/host/sdhci-dove.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * sdhci-dove.c Support for SDHCI on Marvell's Dove SoC - * - * Author: Saeed Bishara - * Mike Rapoport - * Based on sdhci-cns3xxx.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include - -#include "sdhci.h" -#include "sdhci-pltfm.h" - -static u16 sdhci_dove_readw(struct sdhci_host *host, int reg) -{ - u16 ret; - - switch (reg) { - case SDHCI_HOST_VERSION: - case SDHCI_SLOT_INT_STATUS: - /* those registers don't exist */ - return 0; - default: - ret = readw(host->ioaddr + reg); - } - return ret; -} - -static u32 sdhci_dove_readl(struct sdhci_host *host, int reg) -{ - u32 ret; - - switch (reg) { - case SDHCI_CAPABILITIES: - ret = readl(host->ioaddr + reg); - /* Mask the support for 3.0V */ - ret &= ~SDHCI_CAN_VDD_300; - break; - default: - ret = readl(host->ioaddr + reg); - } - return ret; -} - -static struct sdhci_ops sdhci_dove_ops = { - .read_w = sdhci_dove_readw, - .read_l = sdhci_dove_readl, -}; - -struct sdhci_pltfm_data sdhci_dove_pdata = { - .ops = &sdhci_dove_ops, - .quirks = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER | - SDHCI_QUIRK_NO_BUSY_IRQ | - SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | - SDHCI_QUIRK_FORCE_DMA, -}; diff --git a/trunk/drivers/mmc/host/sdhci-pci.c b/trunk/drivers/mmc/host/sdhci-pci.c index 0dc905b20eee..3d9c2460d437 100644 --- a/trunk/drivers/mmc/host/sdhci-pci.c +++ b/trunk/drivers/mmc/host/sdhci-pci.c @@ -176,74 +176,6 @@ static const struct sdhci_pci_fixes sdhci_intel_mfd_emmc_sdio = { .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, }; -/* O2Micro extra registers */ -#define O2_SD_LOCK_WP 0xD3 -#define O2_SD_MULTI_VCC3V 0xEE -#define O2_SD_CLKREQ 0xEC -#define O2_SD_CAPS 0xE0 -#define O2_SD_ADMA1 0xE2 -#define O2_SD_ADMA2 0xE7 -#define O2_SD_INF_MOD 0xF1 - -static int o2_probe(struct sdhci_pci_chip *chip) -{ - int ret; - u8 scratch; - - switch (chip->pdev->device) { - case PCI_DEVICE_ID_O2_8220: - case PCI_DEVICE_ID_O2_8221: - case PCI_DEVICE_ID_O2_8320: - case PCI_DEVICE_ID_O2_8321: - /* This extra setup is required due to broken ADMA. 
*/ - ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); - if (ret) - return ret; - scratch &= 0x7f; - pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); - - /* Set Multi 3 to VCC3V# */ - pci_write_config_byte(chip->pdev, O2_SD_MULTI_VCC3V, 0x08); - - /* Disable CLK_REQ# support after media DET */ - ret = pci_read_config_byte(chip->pdev, O2_SD_CLKREQ, &scratch); - if (ret) - return ret; - scratch |= 0x20; - pci_write_config_byte(chip->pdev, O2_SD_CLKREQ, scratch); - - /* Choose capabilities, enable SDMA. We have to write 0x01 - * to the capabilities register first to unlock it. - */ - ret = pci_read_config_byte(chip->pdev, O2_SD_CAPS, &scratch); - if (ret) - return ret; - scratch |= 0x01; - pci_write_config_byte(chip->pdev, O2_SD_CAPS, scratch); - pci_write_config_byte(chip->pdev, O2_SD_CAPS, 0x73); - - /* Disable ADMA1/2 */ - pci_write_config_byte(chip->pdev, O2_SD_ADMA1, 0x39); - pci_write_config_byte(chip->pdev, O2_SD_ADMA2, 0x08); - - /* Disable the infinite transfer mode */ - ret = pci_read_config_byte(chip->pdev, O2_SD_INF_MOD, &scratch); - if (ret) - return ret; - scratch |= 0x08; - pci_write_config_byte(chip->pdev, O2_SD_INF_MOD, scratch); - - /* Lock WP */ - ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); - if (ret) - return ret; - scratch |= 0x80; - pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); - } - - return 0; -} - static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) { u8 scratch; @@ -272,7 +204,6 @@ static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) static int jmicron_probe(struct sdhci_pci_chip *chip) { int ret; - u16 mmcdev = 0; if (chip->pdev->revision == 0) { chip->quirks |= SDHCI_QUIRK_32BIT_DMA_ADDR | @@ -294,17 +225,12 @@ static int jmicron_probe(struct sdhci_pci_chip *chip) * 2. The MMC interface has a lower subfunction number * than the SD interface. */ - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD) - mmcdev = PCI_DEVICE_ID_JMICRON_JMB38X_MMC; - else if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_SD) - mmcdev = PCI_DEVICE_ID_JMICRON_JMB388_ESD; - - if (mmcdev) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD) { struct pci_dev *sd_dev; sd_dev = NULL; while ((sd_dev = pci_get_device(PCI_VENDOR_ID_JMICRON, - mmcdev, sd_dev)) != NULL) { + PCI_DEVICE_ID_JMICRON_JMB38X_MMC, sd_dev)) != NULL) { if ((PCI_SLOT(chip->pdev->devfn) == PCI_SLOT(sd_dev->devfn)) && (chip->pdev->bus == sd_dev->bus)) @@ -364,25 +290,13 @@ static int jmicron_probe_slot(struct sdhci_pci_slot *slot) slot->host->quirks |= SDHCI_QUIRK_BROKEN_ADMA; } - /* JM388 MMC doesn't support 1.8V while SD supports it */ - if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { - slot->host->ocr_avail_sd = MMC_VDD_32_33 | MMC_VDD_33_34 | - MMC_VDD_29_30 | MMC_VDD_30_31 | - MMC_VDD_165_195; /* allow 1.8V */ - slot->host->ocr_avail_mmc = MMC_VDD_32_33 | MMC_VDD_33_34 | - MMC_VDD_29_30 | MMC_VDD_30_31; /* no 1.8V for MMC */ - } - /* * The secondary interface requires a bit set to get the * interrupts. 
*/ - if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || - slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) + if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) jmicron_enable_mmc(slot->host, 1); - slot->host->mmc->caps |= MMC_CAP_BUS_WIDTH_TEST; - return 0; } @@ -391,8 +305,7 @@ static void jmicron_remove_slot(struct sdhci_pci_slot *slot, int dead) if (dead) return; - if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || - slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) + if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) jmicron_enable_mmc(slot->host, 0); } @@ -400,8 +313,7 @@ static int jmicron_suspend(struct sdhci_pci_chip *chip, pm_message_t state) { int i; - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || - chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) { for (i = 0;i < chip->num_slots;i++) jmicron_enable_mmc(chip->slots[i]->host, 0); } @@ -413,8 +325,7 @@ static int jmicron_resume(struct sdhci_pci_chip *chip) { int ret, i; - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || - chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) { for (i = 0;i < chip->num_slots;i++) jmicron_enable_mmc(chip->slots[i]->host, 1); } @@ -428,10 +339,6 @@ static int jmicron_resume(struct sdhci_pci_chip *chip) return 0; } -static const struct sdhci_pci_fixes sdhci_o2 = { - .probe = o2_probe, -}; - static const struct sdhci_pci_fixes sdhci_jmicron = { .probe = jmicron_probe, @@ -602,22 +509,6 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .driver_data = (kernel_ulong_t)&sdhci_jmicron, }, - { - .vendor = PCI_VENDOR_ID_JMICRON, - .device = PCI_DEVICE_ID_JMICRON_JMB388_SD, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_jmicron, - }, - - { - .vendor = PCI_VENDOR_ID_JMICRON, - .device = PCI_DEVICE_ID_JMICRON_JMB388_ESD, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_jmicron, - }, - { .vendor = PCI_VENDOR_ID_SYSKONNECT, .device = 0x8000, @@ -698,46 +589,6 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, }, - { - .vendor = PCI_VENDOR_ID_O2, - .device = PCI_DEVICE_ID_O2_8120, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_o2, - }, - - { - .vendor = PCI_VENDOR_ID_O2, - .device = PCI_DEVICE_ID_O2_8220, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_o2, - }, - - { - .vendor = PCI_VENDOR_ID_O2, - .device = PCI_DEVICE_ID_O2_8221, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_o2, - }, - - { - .vendor = PCI_VENDOR_ID_O2, - .device = PCI_DEVICE_ID_O2_8320, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_o2, - }, - - { - .vendor = PCI_VENDOR_ID_O2, - .device = PCI_DEVICE_ID_O2_8321, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_o2, - }, - { /* Generic SD host controller */ PCI_DEVICE_CLASS((PCI_CLASS_SYSTEM_SDHCI << 8), 0xFFFF00) }, diff --git a/trunk/drivers/mmc/host/sdhci-pltfm.c b/trunk/drivers/mmc/host/sdhci-pltfm.c index dbab0407f4b6..0502f89f662b 100644 --- a/trunk/drivers/mmc/host/sdhci-pltfm.c +++ b/trunk/drivers/mmc/host/sdhci-pltfm.c @@ -169,12 +169,6 
@@ static const struct platform_device_id sdhci_pltfm_ids[] = { #endif #ifdef CONFIG_MMC_SDHCI_ESDHC_IMX { "sdhci-esdhc-imx", (kernel_ulong_t)&sdhci_esdhc_imx_pdata }, -#endif -#ifdef CONFIG_MMC_SDHCI_DOVE - { "sdhci-dove", (kernel_ulong_t)&sdhci_dove_pdata }, -#endif -#ifdef CONFIG_MMC_SDHCI_TEGRA - { "sdhci-tegra", (kernel_ulong_t)&sdhci_tegra_pdata }, #endif { }, }; diff --git a/trunk/drivers/mmc/host/sdhci-pltfm.h b/trunk/drivers/mmc/host/sdhci-pltfm.h index ea2e44d9be5e..c1bfe48af56a 100644 --- a/trunk/drivers/mmc/host/sdhci-pltfm.h +++ b/trunk/drivers/mmc/host/sdhci-pltfm.h @@ -22,7 +22,5 @@ struct sdhci_pltfm_host { extern struct sdhci_pltfm_data sdhci_cns3xxx_pdata; extern struct sdhci_pltfm_data sdhci_esdhc_imx_pdata; -extern struct sdhci_pltfm_data sdhci_dove_pdata; -extern struct sdhci_pltfm_data sdhci_tegra_pdata; #endif /* _DRIVERS_MMC_SDHCI_PLTFM_H */ diff --git a/trunk/drivers/mmc/host/sdhci-s3c.c b/trunk/drivers/mmc/host/sdhci-s3c.c index 17203586305c..aacb862ecc8a 100644 --- a/trunk/drivers/mmc/host/sdhci-s3c.c +++ b/trunk/drivers/mmc/host/sdhci-s3c.c @@ -130,15 +130,6 @@ static unsigned int sdhci_s3c_consider_clock(struct sdhci_s3c *ourhost, if (!clksrc) return UINT_MAX; - /* - * Clock divider's step is different as 1 from that of host controller - * when 'clk_type' is S3C_SDHCI_CLK_DIV_EXTERNAL. - */ - if (ourhost->pdata->clk_type) { - rate = clk_round_rate(clksrc, wanted); - return wanted - rate; - } - rate = clk_get_rate(clksrc); for (div = 1; div < 256; div *= 2) { @@ -241,42 +232,6 @@ static unsigned int sdhci_s3c_get_min_clock(struct sdhci_host *host) return min; } -/* sdhci_cmu_get_max_clk - callback to get maximum clock frequency.*/ -static unsigned int sdhci_cmu_get_max_clock(struct sdhci_host *host) -{ - struct sdhci_s3c *ourhost = to_s3c(host); - - return clk_round_rate(ourhost->clk_bus[ourhost->cur_clk], UINT_MAX); -} - -/* sdhci_cmu_get_min_clock - callback to get minimal supported clock value. */ -static unsigned int sdhci_cmu_get_min_clock(struct sdhci_host *host) -{ - struct sdhci_s3c *ourhost = to_s3c(host); - - /* - * initial clock can be in the frequency range of - * 100KHz-400KHz, so we set it as max value. - */ - return clk_round_rate(ourhost->clk_bus[ourhost->cur_clk], 400000); -} - -/* sdhci_cmu_set_clock - callback on clock change.*/ -static void sdhci_cmu_set_clock(struct sdhci_host *host, unsigned int clock) -{ - struct sdhci_s3c *ourhost = to_s3c(host); - - /* don't bother if the clock is going off */ - if (clock == 0) - return; - - sdhci_s3c_set_clock(host, clock); - - clk_set_rate(ourhost->clk_bus[ourhost->cur_clk], clock); - - host->clock = clock; -} - static struct sdhci_ops sdhci_s3c_ops = { .get_max_clock = sdhci_s3c_get_max_clk, .set_clock = sdhci_s3c_set_clock, @@ -406,13 +361,6 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev) clks++; sc->clk_bus[ptr] = clk; - - /* - * save current clock index to know which clock bus - * is used later in overriding functions. - */ - sc->cur_clk = ptr; - clk_enable(clk); dev_info(dev, "clock source %d: %s (%ld Hz)\n", @@ -479,20 +427,6 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev) /* HSMMC on Samsung SoCs uses SDCLK as timeout clock */ host->quirks |= SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK; - /* - * If controller does not have internal clock divider, - * we can use overriding functions instead of default. 
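The removed comment just above (and the probe-time code that follows below in this hunk) describe sdhci-s3c swapping in the sdhci_cmu_* callbacks whenever the board routes the SD clock through an external divider. Stripped of the driver specifics, that is an ops-table override decided at init time; a small self-contained sketch of the pattern (all names here are illustrative, none come from the driver):

#include <stdio.h>

struct clk_ops {
    unsigned int (*get_max_clock)(void);
    void (*set_clock)(unsigned int hz);
};

static unsigned int default_get_max_clock(void) { return 50000000; }
static void default_set_clock(unsigned int hz) { printf("internal divider: %u Hz\n", hz); }

/* Variants used when an external clock-management unit owns the divider. */
static unsigned int cmu_get_max_clock(void) { return 100000000; }
static void cmu_set_clock(unsigned int hz) { printf("external divider: %u Hz\n", hz); }

static struct clk_ops ops = {
    .get_max_clock = default_get_max_clock,
    .set_clock     = default_set_clock,
};

int main(void)
{
    int has_external_divider = 1;    /* stand-in for the driver's pdata->clk_type */

    if (has_external_divider) {      /* mirrors the removed probe-time override */
        ops.get_max_clock = cmu_get_max_clock;
        ops.set_clock     = cmu_set_clock;
    }
    ops.set_clock(ops.get_max_clock());
    return 0;
}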
- */ - if (pdata->clk_type) { - sdhci_s3c_ops.set_clock = sdhci_cmu_set_clock; - sdhci_s3c_ops.get_min_clock = sdhci_cmu_get_min_clock; - sdhci_s3c_ops.get_max_clock = sdhci_cmu_get_max_clock; - } - - /* It supports additional host capabilities if needed */ - if (pdata->host_caps) - host->mmc->caps |= pdata->host_caps; - ret = sdhci_add_host(host); if (ret) { dev_err(dev, "sdhci_add_host() failed\n"); diff --git a/trunk/drivers/mmc/host/sdhci-tegra.c b/trunk/drivers/mmc/host/sdhci-tegra.c deleted file mode 100644 index 4823ee94a63f..000000000000 --- a/trunk/drivers/mmc/host/sdhci-tegra.c +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "sdhci.h" -#include "sdhci-pltfm.h" - -static u32 tegra_sdhci_readl(struct sdhci_host *host, int reg) -{ - u32 val; - - if (unlikely(reg == SDHCI_PRESENT_STATE)) { - /* Use wp_gpio here instead? */ - val = readl(host->ioaddr + reg); - return val | SDHCI_WRITE_PROTECT; - } - - return readl(host->ioaddr + reg); -} - -static u16 tegra_sdhci_readw(struct sdhci_host *host, int reg) -{ - if (unlikely(reg == SDHCI_HOST_VERSION)) { - /* Erratum: Version register is invalid in HW. */ - return SDHCI_SPEC_200; - } - - return readw(host->ioaddr + reg); -} - -static void tegra_sdhci_writel(struct sdhci_host *host, u32 val, int reg) -{ - /* Seems like we're getting spurious timeout and crc errors, so - * disable signalling of them. In case of real errors software - * timers should take care of eventually detecting them. 
- */ - if (unlikely(reg == SDHCI_SIGNAL_ENABLE)) - val &= ~(SDHCI_INT_TIMEOUT|SDHCI_INT_CRC); - - writel(val, host->ioaddr + reg); - - if (unlikely(reg == SDHCI_INT_ENABLE)) { - /* Erratum: Must enable block gap interrupt detection */ - u8 gap_ctrl = readb(host->ioaddr + SDHCI_BLOCK_GAP_CONTROL); - if (val & SDHCI_INT_CARD_INT) - gap_ctrl |= 0x8; - else - gap_ctrl &= ~0x8; - writeb(gap_ctrl, host->ioaddr + SDHCI_BLOCK_GAP_CONTROL); - } -} - -static unsigned int tegra_sdhci_get_ro(struct sdhci_host *sdhci) -{ - struct platform_device *pdev = to_platform_device(mmc_dev(sdhci->mmc)); - struct tegra_sdhci_platform_data *plat; - - plat = pdev->dev.platform_data; - - if (!gpio_is_valid(plat->wp_gpio)) - return -1; - - return gpio_get_value(plat->wp_gpio); -} - -static irqreturn_t carddetect_irq(int irq, void *data) -{ - struct sdhci_host *sdhost = (struct sdhci_host *)data; - - tasklet_schedule(&sdhost->card_tasklet); - return IRQ_HANDLED; -}; - -static int tegra_sdhci_8bit(struct sdhci_host *host, int bus_width) -{ - struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); - struct tegra_sdhci_platform_data *plat; - u32 ctrl; - - plat = pdev->dev.platform_data; - - ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); - if (plat->is_8bit && bus_width == MMC_BUS_WIDTH_8) { - ctrl &= ~SDHCI_CTRL_4BITBUS; - ctrl |= SDHCI_CTRL_8BITBUS; - } else { - ctrl &= ~SDHCI_CTRL_8BITBUS; - if (bus_width == MMC_BUS_WIDTH_4) - ctrl |= SDHCI_CTRL_4BITBUS; - else - ctrl &= ~SDHCI_CTRL_4BITBUS; - } - sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); - return 0; -} - - -static int tegra_sdhci_pltfm_init(struct sdhci_host *host, - struct sdhci_pltfm_data *pdata) -{ - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); - struct tegra_sdhci_platform_data *plat; - struct clk *clk; - int rc; - - plat = pdev->dev.platform_data; - if (plat == NULL) { - dev_err(mmc_dev(host->mmc), "missing platform data\n"); - return -ENXIO; - } - - if (gpio_is_valid(plat->power_gpio)) { - rc = gpio_request(plat->power_gpio, "sdhci_power"); - if (rc) { - dev_err(mmc_dev(host->mmc), - "failed to allocate power gpio\n"); - goto out; - } - tegra_gpio_enable(plat->power_gpio); - gpio_direction_output(plat->power_gpio, 1); - } - - if (gpio_is_valid(plat->cd_gpio)) { - rc = gpio_request(plat->cd_gpio, "sdhci_cd"); - if (rc) { - dev_err(mmc_dev(host->mmc), - "failed to allocate cd gpio\n"); - goto out_power; - } - tegra_gpio_enable(plat->cd_gpio); - gpio_direction_input(plat->cd_gpio); - - rc = request_irq(gpio_to_irq(plat->cd_gpio), carddetect_irq, - IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, - mmc_hostname(host->mmc), host); - - if (rc) { - dev_err(mmc_dev(host->mmc), "request irq error\n"); - goto out_cd; - } - - } - - if (gpio_is_valid(plat->wp_gpio)) { - rc = gpio_request(plat->wp_gpio, "sdhci_wp"); - if (rc) { - dev_err(mmc_dev(host->mmc), - "failed to allocate wp gpio\n"); - goto out_cd; - } - tegra_gpio_enable(plat->wp_gpio); - gpio_direction_input(plat->wp_gpio); - } - - clk = clk_get(mmc_dev(host->mmc), NULL); - if (IS_ERR(clk)) { - dev_err(mmc_dev(host->mmc), "clk err\n"); - rc = PTR_ERR(clk); - goto out_wp; - } - clk_enable(clk); - pltfm_host->clk = clk; - - if (plat->is_8bit) - host->mmc->caps |= MMC_CAP_8_BIT_DATA; - - return 0; - -out_wp: - if (gpio_is_valid(plat->wp_gpio)) { - tegra_gpio_disable(plat->wp_gpio); - gpio_free(plat->wp_gpio); - } - -out_cd: - if (gpio_is_valid(plat->cd_gpio)) { - tegra_gpio_disable(plat->cd_gpio); - 
gpio_free(plat->cd_gpio); - } - -out_power: - if (gpio_is_valid(plat->power_gpio)) { - tegra_gpio_disable(plat->power_gpio); - gpio_free(plat->power_gpio); - } - -out: - return rc; -} - -static void tegra_sdhci_pltfm_exit(struct sdhci_host *host) -{ - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); - struct tegra_sdhci_platform_data *plat; - - plat = pdev->dev.platform_data; - - if (gpio_is_valid(plat->wp_gpio)) { - tegra_gpio_disable(plat->wp_gpio); - gpio_free(plat->wp_gpio); - } - - if (gpio_is_valid(plat->cd_gpio)) { - tegra_gpio_disable(plat->cd_gpio); - gpio_free(plat->cd_gpio); - } - - if (gpio_is_valid(plat->power_gpio)) { - tegra_gpio_disable(plat->power_gpio); - gpio_free(plat->power_gpio); - } - - clk_disable(pltfm_host->clk); - clk_put(pltfm_host->clk); -} - -static struct sdhci_ops tegra_sdhci_ops = { - .get_ro = tegra_sdhci_get_ro, - .read_l = tegra_sdhci_readl, - .read_w = tegra_sdhci_readw, - .write_l = tegra_sdhci_writel, - .platform_8bit_width = tegra_sdhci_8bit, -}; - -struct sdhci_pltfm_data sdhci_tegra_pdata = { - .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | - SDHCI_QUIRK_SINGLE_POWER_WRITE | - SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, - .ops = &tegra_sdhci_ops, - .init = tegra_sdhci_pltfm_init, - .exit = tegra_sdhci_pltfm_exit, -}; diff --git a/trunk/drivers/mmc/host/sdhci.c b/trunk/drivers/mmc/host/sdhci.c index 9e15f41f87be..a25db426c910 100644 --- a/trunk/drivers/mmc/host/sdhci.c +++ b/trunk/drivers/mmc/host/sdhci.c @@ -23,7 +23,6 @@ #include -#include #include #include "sdhci.h" @@ -78,11 +77,8 @@ static void sdhci_dumpregs(struct sdhci_host *host) printk(KERN_DEBUG DRIVER_NAME ": AC12 err: 0x%08x | Slot int: 0x%08x\n", sdhci_readw(host, SDHCI_ACMD12_ERR), sdhci_readw(host, SDHCI_SLOT_INT_STATUS)); - printk(KERN_DEBUG DRIVER_NAME ": Caps: 0x%08x | Caps_1: 0x%08x\n", + printk(KERN_DEBUG DRIVER_NAME ": Caps: 0x%08x | Max curr: 0x%08x\n", sdhci_readl(host, SDHCI_CAPABILITIES), - sdhci_readl(host, SDHCI_CAPABILITIES_1)); - printk(KERN_DEBUG DRIVER_NAME ": Cmd: 0x%08x | Max curr: 0x%08x\n", - sdhci_readw(host, SDHCI_COMMAND), sdhci_readl(host, SDHCI_MAX_CURRENT)); if (host->flags & SDHCI_USE_ADMA) @@ -1522,11 +1518,7 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) if (intmask & SDHCI_INT_DATA_TIMEOUT) host->data->error = -ETIMEDOUT; - else if (intmask & SDHCI_INT_DATA_END_BIT) - host->data->error = -EILSEQ; - else if ((intmask & SDHCI_INT_DATA_CRC) && - SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)) - != MMC_BUS_TEST_R) + else if (intmask & (SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_END_BIT)) host->data->error = -EILSEQ; else if (intmask & SDHCI_INT_ADMA_ERROR) { printk(KERN_ERR "%s: ADMA error\n", mmc_hostname(host->mmc)); @@ -1744,7 +1736,7 @@ EXPORT_SYMBOL_GPL(sdhci_alloc_host); int sdhci_add_host(struct sdhci_host *host) { struct mmc_host *mmc; - unsigned int caps, ocr_avail; + unsigned int caps; int ret; WARN_ON(host == NULL); @@ -1898,26 +1890,13 @@ int sdhci_add_host(struct sdhci_host *host) mmc_card_is_removable(mmc)) mmc->caps |= MMC_CAP_NEEDS_POLL; - ocr_avail = 0; + mmc->ocr_avail = 0; if (caps & SDHCI_CAN_VDD_330) - ocr_avail |= MMC_VDD_32_33 | MMC_VDD_33_34; + mmc->ocr_avail |= MMC_VDD_32_33|MMC_VDD_33_34; if (caps & SDHCI_CAN_VDD_300) - ocr_avail |= MMC_VDD_29_30 | MMC_VDD_30_31; + mmc->ocr_avail |= MMC_VDD_29_30|MMC_VDD_30_31; if (caps & SDHCI_CAN_VDD_180) - ocr_avail |= MMC_VDD_165_195; - - mmc->ocr_avail = ocr_avail; - 
mmc->ocr_avail_sdio = ocr_avail; - if (host->ocr_avail_sdio) - mmc->ocr_avail_sdio &= host->ocr_avail_sdio; - mmc->ocr_avail_sd = ocr_avail; - if (host->ocr_avail_sd) - mmc->ocr_avail_sd &= host->ocr_avail_sd; - else /* normal SD controllers don't support 1.8V */ - mmc->ocr_avail_sd &= ~MMC_VDD_165_195; - mmc->ocr_avail_mmc = ocr_avail; - if (host->ocr_avail_mmc) - mmc->ocr_avail_mmc &= host->ocr_avail_mmc; + mmc->ocr_avail |= MMC_VDD_165_195; if (mmc->ocr_avail == 0) { printk(KERN_ERR "%s: Hardware doesn't report any " @@ -1949,14 +1928,10 @@ int sdhci_add_host(struct sdhci_host *host) * of bytes. When doing hardware scatter/gather, each entry cannot * be larger than 64 KiB though. */ - if (host->flags & SDHCI_USE_ADMA) { - if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC) - mmc->max_seg_size = 65535; - else - mmc->max_seg_size = 65536; - } else { + if (host->flags & SDHCI_USE_ADMA) + mmc->max_seg_size = 65536; + else mmc->max_seg_size = mmc->max_req_size; - } /* * Maximum block size. This varies from controller to controller and diff --git a/trunk/drivers/mmc/host/sdhci.h b/trunk/drivers/mmc/host/sdhci.h index 6e0969e40650..e42d7f00c060 100644 --- a/trunk/drivers/mmc/host/sdhci.h +++ b/trunk/drivers/mmc/host/sdhci.h @@ -52,7 +52,6 @@ #define SDHCI_CMD_RESP_SHORT_BUSY 0x03 #define SDHCI_MAKE_CMD(c, f) (((c & 0xff) << 8) | (f & 0xff)) -#define SDHCI_GET_CMD(c) ((c>>8) & 0x3f) #define SDHCI_RESPONSE 0x10 @@ -166,7 +165,7 @@ #define SDHCI_CAN_VDD_180 0x04000000 #define SDHCI_CAN_64BIT 0x10000000 -#define SDHCI_CAPABILITIES_1 0x44 +/* 44-47 reserved for more caps */ #define SDHCI_MAX_CURRENT 0x48 diff --git a/trunk/drivers/mmc/host/tmio_mmc.c b/trunk/drivers/mmc/host/tmio_mmc.c index e3c6ef208391..e7765a89593e 100644 --- a/trunk/drivers/mmc/host/tmio_mmc.c +++ b/trunk/drivers/mmc/host/tmio_mmc.c @@ -25,261 +25,16 @@ * double buffer support * */ - -#include +#include +#include #include +#include #include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include -#include -#include - -#define CTL_SD_CMD 0x00 -#define CTL_ARG_REG 0x04 -#define CTL_STOP_INTERNAL_ACTION 0x08 -#define CTL_XFER_BLK_COUNT 0xa -#define CTL_RESPONSE 0x0c -#define CTL_STATUS 0x1c -#define CTL_IRQ_MASK 0x20 -#define CTL_SD_CARD_CLK_CTL 0x24 -#define CTL_SD_XFER_LEN 0x26 -#define CTL_SD_MEM_CARD_OPT 0x28 -#define CTL_SD_ERROR_DETAIL_STATUS 0x2c -#define CTL_SD_DATA_PORT 0x30 -#define CTL_TRANSACTION_CTL 0x34 -#define CTL_SDIO_STATUS 0x36 -#define CTL_SDIO_IRQ_MASK 0x38 -#define CTL_RESET_SD 0xe0 -#define CTL_SDIO_REGS 0x100 -#define CTL_CLK_AND_WAIT_CTL 0x138 -#define CTL_RESET_SDIO 0x1e0 - -/* Definitions for values the CTRL_STATUS register can take. 
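Stepping back to the sdhci.c hunk above: sdhci_add_host() now folds the controller's capability bits directly into mmc->ocr_avail rather than deriving separate masks per bus type. A self-contained restatement of that capability-to-OCR mapping (SDHCI_CAN_VDD_180 is visible in this patch; the other constants mirror the usual sdhci.h and linux/mmc/host.h values and should be treated as illustrative):

#include <stdint.h>
#include <stdio.h>

/* SDHCI capability bits (bit 26 = 1.8 V appears in the sdhci.h hunk above). */
#define SDHCI_CAN_VDD_330   0x01000000
#define SDHCI_CAN_VDD_300   0x02000000
#define SDHCI_CAN_VDD_180   0x04000000

/* OCR voltage-window bits, mirroring linux/mmc/host.h. */
#define MMC_VDD_165_195     0x00000080  /* 1.65 - 1.95 V */
#define MMC_VDD_29_30       0x00020000
#define MMC_VDD_30_31       0x00040000
#define MMC_VDD_32_33       0x00100000
#define MMC_VDD_33_34       0x00200000

/* Same mapping the patched sdhci_add_host() applies to mmc->ocr_avail. */
static uint32_t sdhci_caps_to_ocr(uint32_t caps)
{
    uint32_t ocr = 0;

    if (caps & SDHCI_CAN_VDD_330)
        ocr |= MMC_VDD_32_33 | MMC_VDD_33_34;
    if (caps & SDHCI_CAN_VDD_300)
        ocr |= MMC_VDD_29_30 | MMC_VDD_30_31;
    if (caps & SDHCI_CAN_VDD_180)
        ocr |= MMC_VDD_165_195;

    return ocr;
}

int main(void)
{
    /* A 3.3 V + 1.8 V capable controller. */
    printf("ocr_avail = 0x%08x\n",
           sdhci_caps_to_ocr(SDHCI_CAN_VDD_330 | SDHCI_CAN_VDD_180));
    return 0;
}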
*/ -#define TMIO_STAT_CMDRESPEND 0x00000001 -#define TMIO_STAT_DATAEND 0x00000004 -#define TMIO_STAT_CARD_REMOVE 0x00000008 -#define TMIO_STAT_CARD_INSERT 0x00000010 -#define TMIO_STAT_SIGSTATE 0x00000020 -#define TMIO_STAT_WRPROTECT 0x00000080 -#define TMIO_STAT_CARD_REMOVE_A 0x00000100 -#define TMIO_STAT_CARD_INSERT_A 0x00000200 -#define TMIO_STAT_SIGSTATE_A 0x00000400 -#define TMIO_STAT_CMD_IDX_ERR 0x00010000 -#define TMIO_STAT_CRCFAIL 0x00020000 -#define TMIO_STAT_STOPBIT_ERR 0x00040000 -#define TMIO_STAT_DATATIMEOUT 0x00080000 -#define TMIO_STAT_RXOVERFLOW 0x00100000 -#define TMIO_STAT_TXUNDERRUN 0x00200000 -#define TMIO_STAT_CMDTIMEOUT 0x00400000 -#define TMIO_STAT_RXRDY 0x01000000 -#define TMIO_STAT_TXRQ 0x02000000 -#define TMIO_STAT_ILL_FUNC 0x20000000 -#define TMIO_STAT_CMD_BUSY 0x40000000 -#define TMIO_STAT_ILL_ACCESS 0x80000000 - -/* Definitions for values the CTRL_SDIO_STATUS register can take. */ -#define TMIO_SDIO_STAT_IOIRQ 0x0001 -#define TMIO_SDIO_STAT_EXPUB52 0x4000 -#define TMIO_SDIO_STAT_EXWT 0x8000 -#define TMIO_SDIO_MASK_ALL 0xc007 - -/* Define some IRQ masks */ -/* This is the mask used at reset by the chip */ -#define TMIO_MASK_ALL 0x837f031d -#define TMIO_MASK_READOP (TMIO_STAT_RXRDY | TMIO_STAT_DATAEND) -#define TMIO_MASK_WRITEOP (TMIO_STAT_TXRQ | TMIO_STAT_DATAEND) -#define TMIO_MASK_CMD (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT | \ - TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT) -#define TMIO_MASK_IRQ (TMIO_MASK_READOP | TMIO_MASK_WRITEOP | TMIO_MASK_CMD) - -#define enable_mmc_irqs(host, i) \ - do { \ - u32 mask;\ - mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \ - mask &= ~((i) & TMIO_MASK_IRQ); \ - sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \ - } while (0) - -#define disable_mmc_irqs(host, i) \ - do { \ - u32 mask;\ - mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \ - mask |= ((i) & TMIO_MASK_IRQ); \ - sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \ - } while (0) - -#define ack_mmc_irqs(host, i) \ - do { \ - sd_ctrl_write32((host), CTL_STATUS, ~(i)); \ - } while (0) - -/* This is arbitrary, just noone needed any higher alignment yet */ -#define MAX_ALIGN 4 - -struct tmio_mmc_host { - void __iomem *ctl; - unsigned long bus_shift; - struct mmc_command *cmd; - struct mmc_request *mrq; - struct mmc_data *data; - struct mmc_host *mmc; - int irq; - unsigned int sdio_irq_enabled; - - /* Callbacks for clock / power control */ - void (*set_pwr)(struct platform_device *host, int state); - void (*set_clk_div)(struct platform_device *host, int state); - - /* pio related stuff */ - struct scatterlist *sg_ptr; - struct scatterlist *sg_orig; - unsigned int sg_len; - unsigned int sg_off; - - struct platform_device *pdev; - - /* DMA support */ - struct dma_chan *chan_rx; - struct dma_chan *chan_tx; - struct tasklet_struct dma_complete; - struct tasklet_struct dma_issue; -#ifdef CONFIG_TMIO_MMC_DMA - unsigned int dma_sglen; - u8 bounce_buf[PAGE_CACHE_SIZE] __attribute__((aligned(MAX_ALIGN))); - struct scatterlist bounce_sg; -#endif - - /* Track lost interrupts */ - struct delayed_work delayed_reset_work; - spinlock_t lock; - unsigned long last_req_ts; -}; - -static void tmio_check_bounce_buffer(struct tmio_mmc_host *host); - -static u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) -{ - return readw(host->ctl + (addr << host->bus_shift)); -} - -static void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, - u16 *buf, int count) -{ - readsw(host->ctl + (addr << host->bus_shift), buf, count); -} -static u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) -{ - 
return readw(host->ctl + (addr << host->bus_shift)) | - readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; -} - -static void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) -{ - writew(val, host->ctl + (addr << host->bus_shift)); -} - -static void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, - u16 *buf, int count) -{ - writesw(host->ctl + (addr << host->bus_shift), buf, count); -} - -static void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) -{ - writew(val, host->ctl + (addr << host->bus_shift)); - writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); -} - -static void tmio_mmc_init_sg(struct tmio_mmc_host *host, struct mmc_data *data) -{ - host->sg_len = data->sg_len; - host->sg_ptr = data->sg; - host->sg_orig = data->sg; - host->sg_off = 0; -} - -static int tmio_mmc_next_sg(struct tmio_mmc_host *host) -{ - host->sg_ptr = sg_next(host->sg_ptr); - host->sg_off = 0; - return --host->sg_len; -} - -static char *tmio_mmc_kmap_atomic(struct scatterlist *sg, unsigned long *flags) -{ - local_irq_save(*flags); - return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset; -} - -static void tmio_mmc_kunmap_atomic(void *virt, unsigned long *flags) -{ - kunmap_atomic(virt, KM_BIO_SRC_IRQ); - local_irq_restore(*flags); -} - -#ifdef CONFIG_MMC_DEBUG - -#define STATUS_TO_TEXT(a) \ - do { \ - if (status & TMIO_STAT_##a) \ - printk(#a); \ - } while (0) - -void pr_debug_status(u32 status) -{ - printk(KERN_DEBUG "status: %08x = ", status); - STATUS_TO_TEXT(CARD_REMOVE); - STATUS_TO_TEXT(CARD_INSERT); - STATUS_TO_TEXT(SIGSTATE); - STATUS_TO_TEXT(WRPROTECT); - STATUS_TO_TEXT(CARD_REMOVE_A); - STATUS_TO_TEXT(CARD_INSERT_A); - STATUS_TO_TEXT(SIGSTATE_A); - STATUS_TO_TEXT(CMD_IDX_ERR); - STATUS_TO_TEXT(STOPBIT_ERR); - STATUS_TO_TEXT(ILL_FUNC); - STATUS_TO_TEXT(CMD_BUSY); - STATUS_TO_TEXT(CMDRESPEND); - STATUS_TO_TEXT(DATAEND); - STATUS_TO_TEXT(CRCFAIL); - STATUS_TO_TEXT(DATATIMEOUT); - STATUS_TO_TEXT(CMDTIMEOUT); - STATUS_TO_TEXT(RXOVERFLOW); - STATUS_TO_TEXT(TXUNDERRUN); - STATUS_TO_TEXT(RXRDY); - STATUS_TO_TEXT(TXRQ); - STATUS_TO_TEXT(ILL_ACCESS); - printk("\n"); -} - -#else -#define pr_debug_status(s) do { } while (0) -#endif - -static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - - if (enable) { - host->sdio_irq_enabled = 1; - sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); - sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, - (TMIO_SDIO_MASK_ALL & ~TMIO_SDIO_STAT_IOIRQ)); - } else { - sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, TMIO_SDIO_MASK_ALL); - sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0000); - host->sdio_irq_enabled = 0; - } -} +#include "tmio_mmc.h" static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) { @@ -300,23 +55,8 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) { - struct mfd_cell *cell = host->pdev->dev.platform_data; - struct tmio_mmc_data *pdata = cell->driver_data; - - /* - * Testing on sh-mobile showed that SDIO IRQs are unmasked when - * CTL_CLK_AND_WAIT_CTL gets written, so we have to disable the - * device IRQ here and restore the SDIO IRQ mask before - * re-enabling the device IRQ. 
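The sd_ctrl_* helpers being dropped from this file (and re-added as inlines in the new tmio_mmc.h later in this patch) model every controller register as one or two 16-bit words, with the byte offset scaled by bus_shift to match how the part is wired. A standalone model of that addressing scheme, with a plain array standing in for the MMIO window (hypothetical names, no kernel headers):

#include <stdint.h>
#include <stdio.h>

#define CTL_IRQ_MASK    0x20        /* register offsets are "native" byte offsets ... */

struct fake_host {
    uint16_t *ctl;                  /* ... but the bus may space them out */
    unsigned long bus_shift;        /* 0: 16-bit spacing, 1: 32-bit spacing */
};

static uint16_t rd16(struct fake_host *h, int addr)
{
    return h->ctl[(addr << h->bus_shift) / 2];
}

static void wr16(struct fake_host *h, int addr, uint16_t val)
{
    h->ctl[(addr << h->bus_shift) / 2] = val;
}

/* 32-bit registers are two consecutive 16-bit halves, low half first. */
static uint32_t rd32(struct fake_host *h, int addr)
{
    return rd16(h, addr) | (uint32_t)rd16(h, addr + 2) << 16;
}

static void wr32(struct fake_host *h, int addr, uint32_t val)
{
    wr16(h, addr, val & 0xffff);
    wr16(h, addr + 2, val >> 16);
}

int main(void)
{
    uint16_t regs[0x200] = { 0 };
    struct fake_host h = { .ctl = regs, .bus_shift = 1 };

    wr32(&h, CTL_IRQ_MASK, 0x837f031d);    /* TMIO_MASK_ALL from this patch */
    printf("0x%08x\n", rd32(&h, CTL_IRQ_MASK));
    return 0;
}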
- */ - if (pdata->flags & TMIO_MMC_SDIO_IRQ) - disable_irq(host->irq); sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0000); msleep(10); - if (pdata->flags & TMIO_MMC_SDIO_IRQ) { - tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled); - enable_irq(host->irq); - } sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, ~0x0100 & sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); msleep(10); @@ -324,21 +64,11 @@ static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) static void tmio_mmc_clk_start(struct tmio_mmc_host *host) { - struct mfd_cell *cell = host->pdev->dev.platform_data; - struct tmio_mmc_data *pdata = cell->driver_data; - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, 0x0100 | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); msleep(10); - /* see comment in tmio_mmc_clk_stop above */ - if (pdata->flags & TMIO_MMC_SDIO_IRQ) - disable_irq(host->irq); sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0100); msleep(10); - if (pdata->flags & TMIO_MMC_SDIO_IRQ) { - tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled); - enable_irq(host->irq); - } } static void reset(struct tmio_mmc_host *host) @@ -352,60 +82,15 @@ static void reset(struct tmio_mmc_host *host) msleep(10); } -static void tmio_mmc_reset_work(struct work_struct *work) -{ - struct tmio_mmc_host *host = container_of(work, struct tmio_mmc_host, - delayed_reset_work.work); - struct mmc_request *mrq; - unsigned long flags; - - spin_lock_irqsave(&host->lock, flags); - mrq = host->mrq; - - /* request already finished */ - if (!mrq - || time_is_after_jiffies(host->last_req_ts + - msecs_to_jiffies(2000))) { - spin_unlock_irqrestore(&host->lock, flags); - return; - } - - dev_warn(&host->pdev->dev, - "timeout waiting for hardware interrupt (CMD%u)\n", - mrq->cmd->opcode); - - if (host->data) - host->data->error = -ETIMEDOUT; - else if (host->cmd) - host->cmd->error = -ETIMEDOUT; - else - mrq->cmd->error = -ETIMEDOUT; - - host->cmd = NULL; - host->data = NULL; - host->mrq = NULL; - - spin_unlock_irqrestore(&host->lock, flags); - - reset(host); - - mmc_request_done(host->mmc, mrq); -} - static void tmio_mmc_finish_request(struct tmio_mmc_host *host) { struct mmc_request *mrq = host->mrq; - if (!mrq) - return; - host->mrq = NULL; host->cmd = NULL; host->data = NULL; - cancel_delayed_work(&host->delayed_reset_work); - mmc_request_done(host->mmc, mrq); } @@ -515,7 +200,6 @@ static void tmio_mmc_pio_irq(struct tmio_mmc_host *host) return; } -/* needs to be called with host->lock held */ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host) { struct mmc_data *data = host->data; @@ -549,8 +233,6 @@ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host) if (data->flags & MMC_DATA_READ) { if (!host->chan_rx) disable_mmc_irqs(host, TMIO_MASK_READOP); - else - tmio_check_bounce_buffer(host); dev_dbg(&host->pdev->dev, "Complete Rx request %p\n", host->mrq); } else { @@ -572,12 +254,10 @@ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host) static void tmio_mmc_data_irq(struct tmio_mmc_host *host) { - struct mmc_data *data; - spin_lock(&host->lock); - data = host->data; + struct mmc_data *data = host->data; if (!data) - goto out; + return; if (host->chan_tx && (data->flags & MMC_DATA_WRITE)) { /* @@ -598,8 +278,6 @@ static void tmio_mmc_data_irq(struct tmio_mmc_host *host) } else { tmio_mmc_do_data_irq(host); } -out: - spin_unlock(&host->lock); } static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, @@ -608,11 +286,9 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, struct mmc_command *cmd = host->cmd; int i, addr; - 
spin_lock(&host->lock); - if (!host->cmd) { pr_debug("Spurious CMD irq\n"); - goto out; + return; } host->cmd = NULL; @@ -648,7 +324,8 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, if (!host->chan_rx) enable_mmc_irqs(host, TMIO_MASK_READOP); } else { - if (!host->chan_tx) + struct dma_chan *chan = host->chan_tx; + if (!chan) enable_mmc_irqs(host, TMIO_MASK_WRITEOP); else tasklet_schedule(&host->dma_issue); @@ -657,19 +334,13 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, tmio_mmc_finish_request(host); } -out: - spin_unlock(&host->lock); - return; } static irqreturn_t tmio_mmc_irq(int irq, void *devid) { struct tmio_mmc_host *host = devid; - struct mfd_cell *cell = host->pdev->dev.platform_data; - struct tmio_mmc_data *pdata = cell->driver_data; unsigned int ireg, irq_mask, status; - unsigned int sdio_ireg, sdio_irq_mask, sdio_status; pr_debug("MMC IRQ begin\n"); @@ -677,29 +348,6 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid) irq_mask = sd_ctrl_read32(host, CTL_IRQ_MASK); ireg = status & TMIO_MASK_IRQ & ~irq_mask; - sdio_ireg = 0; - if (!ireg && pdata->flags & TMIO_MMC_SDIO_IRQ) { - sdio_status = sd_ctrl_read16(host, CTL_SDIO_STATUS); - sdio_irq_mask = sd_ctrl_read16(host, CTL_SDIO_IRQ_MASK); - sdio_ireg = sdio_status & TMIO_SDIO_MASK_ALL & ~sdio_irq_mask; - - sd_ctrl_write16(host, CTL_SDIO_STATUS, sdio_status & ~TMIO_SDIO_MASK_ALL); - - if (sdio_ireg && !host->sdio_irq_enabled) { - pr_warning("tmio_mmc: Spurious SDIO IRQ, disabling! 0x%04x 0x%04x 0x%04x\n", - sdio_status, sdio_irq_mask, sdio_ireg); - tmio_mmc_enable_sdio_irq(host->mmc, 0); - goto out; - } - - if (host->mmc->caps & MMC_CAP_SDIO_IRQ && - sdio_ireg & TMIO_SDIO_STAT_IOIRQ) - mmc_signal_sdio_irq(host->mmc); - - if (sdio_ireg) - goto out; - } - pr_debug_status(status); pr_debug_status(ireg); @@ -727,10 +375,8 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid) */ /* Command completion */ - if (ireg & (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT)) { - ack_mmc_irqs(host, - TMIO_STAT_CMDRESPEND | - TMIO_STAT_CMDTIMEOUT); + if (ireg & TMIO_MASK_CMD) { + ack_mmc_irqs(host, TMIO_MASK_CMD); tmio_mmc_cmd_irq(host, status); } @@ -761,16 +407,6 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid) } #ifdef CONFIG_TMIO_MMC_DMA -static void tmio_check_bounce_buffer(struct tmio_mmc_host *host) -{ - if (host->sg_ptr == &host->bounce_sg) { - unsigned long flags; - void *sg_vaddr = tmio_mmc_kmap_atomic(host->sg_orig, &flags); - memcpy(sg_vaddr, host->bounce_buf, host->bounce_sg.length); - tmio_mmc_kunmap_atomic(sg_vaddr, &flags); - } -} - static void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) { #if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE) @@ -791,39 +427,12 @@ static void tmio_dma_complete(void *arg) enable_mmc_irqs(host, TMIO_STAT_DATAEND); } -static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) +static int tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) { - struct scatterlist *sg = host->sg_ptr, *sg_tmp; + struct scatterlist *sg = host->sg_ptr; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_rx; - struct mfd_cell *cell = host->pdev->dev.platform_data; - struct tmio_mmc_data *pdata = cell->driver_data; - dma_cookie_t cookie; - int ret, i; - bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; - - for_each_sg(sg, sg_tmp, host->sg_len, i) { - if (sg_tmp->offset & align) - aligned = false; - if (sg_tmp->length & align) { - multiple = false; - break; - } - } - - if ((!aligned && 
(host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE || - align >= MAX_ALIGN)) || !multiple) { - ret = -EINVAL; - goto pio; - } - - /* The only sg element can be unaligned, use our bounce buffer then */ - if (!aligned) { - sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length); - host->sg_ptr = &host->bounce_sg; - sg = host->sg_ptr; - } + int ret; ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_FROM_DEVICE); if (ret > 0) { @@ -833,21 +442,21 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) } if (desc) { + host->desc = desc; desc->callback = tmio_dma_complete; desc->callback_param = host; - cookie = desc->tx_submit(desc); - if (cookie < 0) { - desc = NULL; - ret = cookie; + host->cookie = desc->tx_submit(desc); + if (host->cookie < 0) { + host->desc = NULL; + ret = host->cookie; } else { chan->device->device_issue_pending(chan); } } dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); + __func__, host->sg_len, ret, host->cookie, host->mrq); -pio: - if (!desc) { + if (!host->desc) { /* DMA failed, fall back to PIO */ if (ret >= 0) ret = -EIO; @@ -862,49 +471,24 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) dev_warn(&host->pdev->dev, "DMA failed: %d, falling back to PIO\n", ret); tmio_mmc_enable_dma(host, false); + reset(host); + /* Fail this request, let above layers recover */ + host->mrq->cmd->error = ret; + tmio_mmc_finish_request(host); } dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d, sg[%d]\n", __func__, - desc, cookie, host->sg_len); + desc, host->cookie, host->sg_len); + + return ret > 0 ? 0 : ret; } -static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) +static int tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) { - struct scatterlist *sg = host->sg_ptr, *sg_tmp; + struct scatterlist *sg = host->sg_ptr; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_tx; - struct mfd_cell *cell = host->pdev->dev.platform_data; - struct tmio_mmc_data *pdata = cell->driver_data; - dma_cookie_t cookie; - int ret, i; - bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; - - for_each_sg(sg, sg_tmp, host->sg_len, i) { - if (sg_tmp->offset & align) - aligned = false; - if (sg_tmp->length & align) { - multiple = false; - break; - } - } - - if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE || - align >= MAX_ALIGN)) || !multiple) { - ret = -EINVAL; - goto pio; - } - - /* The only sg element can be unaligned, use our bounce buffer then */ - if (!aligned) { - unsigned long flags; - void *sg_vaddr = tmio_mmc_kmap_atomic(sg, &flags); - sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length); - memcpy(host->bounce_buf, sg_vaddr, host->bounce_sg.length); - tmio_mmc_kunmap_atomic(sg_vaddr, &flags); - host->sg_ptr = &host->bounce_sg; - sg = host->sg_ptr; - } + int ret; ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_TO_DEVICE); if (ret > 0) { @@ -914,19 +498,19 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) } if (desc) { + host->desc = desc; desc->callback = tmio_dma_complete; desc->callback_param = host; - cookie = desc->tx_submit(desc); - if (cookie < 0) { - desc = NULL; - ret = cookie; + host->cookie = desc->tx_submit(desc); + if (host->cookie < 0) { + host->desc = NULL; + ret = host->cookie; } } dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); + __func__, host->sg_len, ret, host->cookie, 
host->mrq); -pio: - if (!desc) { + if (!host->desc) { /* DMA failed, fall back to PIO */ if (ret >= 0) ret = -EIO; @@ -941,22 +525,30 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) dev_warn(&host->pdev->dev, "DMA failed: %d, falling back to PIO\n", ret); tmio_mmc_enable_dma(host, false); + reset(host); + /* Fail this request, let above layers recover */ + host->mrq->cmd->error = ret; + tmio_mmc_finish_request(host); } dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d\n", __func__, - desc, cookie); + desc, host->cookie); + + return ret > 0 ? 0 : ret; } -static void tmio_mmc_start_dma(struct tmio_mmc_host *host, +static int tmio_mmc_start_dma(struct tmio_mmc_host *host, struct mmc_data *data) { if (data->flags & MMC_DATA_READ) { if (host->chan_rx) - tmio_mmc_start_dma_rx(host); + return tmio_mmc_start_dma_rx(host); } else { if (host->chan_tx) - tmio_mmc_start_dma_tx(host); + return tmio_mmc_start_dma_tx(host); } + + return 0; } static void tmio_issue_tasklet_fn(unsigned long priv) @@ -970,12 +562,6 @@ static void tmio_issue_tasklet_fn(unsigned long priv) static void tmio_tasklet_fn(unsigned long arg) { struct tmio_mmc_host *host = (struct tmio_mmc_host *)arg; - unsigned long flags; - - spin_lock_irqsave(&host->lock, flags); - - if (!host->data) - goto out; if (host->data->flags & MMC_DATA_READ) dma_unmap_sg(&host->pdev->dev, host->sg_ptr, host->dma_sglen, @@ -985,8 +571,6 @@ static void tmio_tasklet_fn(unsigned long arg) DMA_TO_DEVICE); tmio_mmc_do_data_irq(host); -out: - spin_unlock_irqrestore(&host->lock, flags); } /* It might be necessary to make filter MFD specific */ @@ -1000,6 +584,9 @@ static bool tmio_mmc_filter(struct dma_chan *chan, void *arg) static void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata) { + host->cookie = -EINVAL; + host->desc = NULL; + /* We can only either use DMA for both Tx and Rx or not use it at all */ if (pdata->dma) { dma_cap_mask_t mask; @@ -1045,15 +632,15 @@ static void tmio_mmc_release_dma(struct tmio_mmc_host *host) host->chan_rx = NULL; dma_release_channel(chan); } + + host->cookie = -EINVAL; + host->desc = NULL; } #else -static void tmio_check_bounce_buffer(struct tmio_mmc_host *host) -{ -} - -static void tmio_mmc_start_dma(struct tmio_mmc_host *host, +static int tmio_mmc_start_dma(struct tmio_mmc_host *host, struct mmc_data *data) { + return 0; } static void tmio_mmc_request_dma(struct tmio_mmc_host *host, @@ -1095,9 +682,7 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, sd_ctrl_write16(host, CTL_SD_XFER_LEN, data->blksz); sd_ctrl_write16(host, CTL_XFER_BLK_COUNT, data->blocks); - tmio_mmc_start_dma(host, data); - - return 0; + return tmio_mmc_start_dma(host, data); } /* Process requests from the MMC layer */ @@ -1109,8 +694,6 @@ static void tmio_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) if (host->mrq) pr_debug("request not null\n"); - host->last_req_ts = jiffies; - wmb(); host->mrq = mrq; if (mrq->data) { @@ -1120,14 +703,10 @@ static void tmio_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) } ret = tmio_mmc_start_command(host, mrq->cmd); - if (!ret) { - schedule_delayed_work(&host->delayed_reset_work, - msecs_to_jiffies(2000)); + if (!ret) return; - } fail: - host->mrq = NULL; mrq->cmd->error = ret; mmc_request_done(mmc, mrq); } @@ -1201,7 +780,6 @@ static const struct mmc_host_ops tmio_mmc_ops = { .set_ios = tmio_mmc_set_ios, .get_ro = tmio_mmc_get_ro, .get_cd = tmio_mmc_get_cd, - .enable_sdio_irq = tmio_mmc_enable_sdio_irq, }; #ifdef CONFIG_PM @@ -1286,15 
+864,10 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) goto host_free; mmc->ops = &tmio_mmc_ops; - mmc->caps = MMC_CAP_4_BIT_DATA | pdata->capabilities; + mmc->caps = MMC_CAP_4_BIT_DATA; + mmc->caps |= pdata->capabilities; mmc->f_max = pdata->hclk; mmc->f_min = mmc->f_max / 512; - mmc->max_segs = 32; - mmc->max_blk_size = 512; - mmc->max_blk_count = (PAGE_CACHE_SIZE / mmc->max_blk_size) * - mmc->max_segs; - mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; - mmc->max_seg_size = mmc->max_req_size; if (pdata->ocr_mask) mmc->ocr_avail = pdata->ocr_mask; else @@ -1317,19 +890,12 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) goto cell_disable; disable_mmc_irqs(host, TMIO_MASK_ALL); - if (pdata->flags & TMIO_MMC_SDIO_IRQ) - tmio_mmc_enable_sdio_irq(mmc, 0); ret = request_irq(host->irq, tmio_mmc_irq, IRQF_DISABLED | IRQF_TRIGGER_FALLING, dev_name(&dev->dev), host); if (ret) goto cell_disable; - spin_lock_init(&host->lock); - - /* Init delayed work for request timeouts */ - INIT_DELAYED_WORK(&host->delayed_reset_work, tmio_mmc_reset_work); - /* See if we also get DMA */ tmio_mmc_request_dma(host, pdata); @@ -1368,7 +934,6 @@ static int __devexit tmio_mmc_remove(struct platform_device *dev) if (mmc) { struct tmio_mmc_host *host = mmc_priv(mmc); mmc_remove_host(mmc); - cancel_delayed_work_sync(&host->delayed_reset_work); tmio_mmc_release_dma(host); free_irq(host->irq, host); if (cell->disable) diff --git a/trunk/drivers/mmc/host/tmio_mmc.h b/trunk/drivers/mmc/host/tmio_mmc.h new file mode 100644 index 000000000000..0fedc78e3ea5 --- /dev/null +++ b/trunk/drivers/mmc/host/tmio_mmc.h @@ -0,0 +1,228 @@ +/* Definitons for use with the tmio_mmc.c + * + * (c) 2004 Ian Molton + * (c) 2007 Ian Molton + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include + +#define CTL_SD_CMD 0x00 +#define CTL_ARG_REG 0x04 +#define CTL_STOP_INTERNAL_ACTION 0x08 +#define CTL_XFER_BLK_COUNT 0xa +#define CTL_RESPONSE 0x0c +#define CTL_STATUS 0x1c +#define CTL_IRQ_MASK 0x20 +#define CTL_SD_CARD_CLK_CTL 0x24 +#define CTL_SD_XFER_LEN 0x26 +#define CTL_SD_MEM_CARD_OPT 0x28 +#define CTL_SD_ERROR_DETAIL_STATUS 0x2c +#define CTL_SD_DATA_PORT 0x30 +#define CTL_TRANSACTION_CTL 0x34 +#define CTL_RESET_SD 0xe0 +#define CTL_SDIO_REGS 0x100 +#define CTL_CLK_AND_WAIT_CTL 0x138 +#define CTL_RESET_SDIO 0x1e0 + +/* Definitions for values the CTRL_STATUS register can take. 
*/ +#define TMIO_STAT_CMDRESPEND 0x00000001 +#define TMIO_STAT_DATAEND 0x00000004 +#define TMIO_STAT_CARD_REMOVE 0x00000008 +#define TMIO_STAT_CARD_INSERT 0x00000010 +#define TMIO_STAT_SIGSTATE 0x00000020 +#define TMIO_STAT_WRPROTECT 0x00000080 +#define TMIO_STAT_CARD_REMOVE_A 0x00000100 +#define TMIO_STAT_CARD_INSERT_A 0x00000200 +#define TMIO_STAT_SIGSTATE_A 0x00000400 +#define TMIO_STAT_CMD_IDX_ERR 0x00010000 +#define TMIO_STAT_CRCFAIL 0x00020000 +#define TMIO_STAT_STOPBIT_ERR 0x00040000 +#define TMIO_STAT_DATATIMEOUT 0x00080000 +#define TMIO_STAT_RXOVERFLOW 0x00100000 +#define TMIO_STAT_TXUNDERRUN 0x00200000 +#define TMIO_STAT_CMDTIMEOUT 0x00400000 +#define TMIO_STAT_RXRDY 0x01000000 +#define TMIO_STAT_TXRQ 0x02000000 +#define TMIO_STAT_ILL_FUNC 0x20000000 +#define TMIO_STAT_CMD_BUSY 0x40000000 +#define TMIO_STAT_ILL_ACCESS 0x80000000 + +/* Define some IRQ masks */ +/* This is the mask used at reset by the chip */ +#define TMIO_MASK_ALL 0x837f031d +#define TMIO_MASK_READOP (TMIO_STAT_RXRDY | TMIO_STAT_DATAEND) +#define TMIO_MASK_WRITEOP (TMIO_STAT_TXRQ | TMIO_STAT_DATAEND) +#define TMIO_MASK_CMD (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT | \ + TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT) +#define TMIO_MASK_IRQ (TMIO_MASK_READOP | TMIO_MASK_WRITEOP | TMIO_MASK_CMD) + + +#define enable_mmc_irqs(host, i) \ + do { \ + u32 mask;\ + mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \ + mask &= ~((i) & TMIO_MASK_IRQ); \ + sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \ + } while (0) + +#define disable_mmc_irqs(host, i) \ + do { \ + u32 mask;\ + mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \ + mask |= ((i) & TMIO_MASK_IRQ); \ + sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \ + } while (0) + +#define ack_mmc_irqs(host, i) \ + do { \ + sd_ctrl_write32((host), CTL_STATUS, ~(i)); \ + } while (0) + + +struct tmio_mmc_host { + void __iomem *ctl; + unsigned long bus_shift; + struct mmc_command *cmd; + struct mmc_request *mrq; + struct mmc_data *data; + struct mmc_host *mmc; + int irq; + + /* Callbacks for clock / power control */ + void (*set_pwr)(struct platform_device *host, int state); + void (*set_clk_div)(struct platform_device *host, int state); + + /* pio related stuff */ + struct scatterlist *sg_ptr; + unsigned int sg_len; + unsigned int sg_off; + + struct platform_device *pdev; + + /* DMA support */ + struct dma_chan *chan_rx; + struct dma_chan *chan_tx; + struct tasklet_struct dma_complete; + struct tasklet_struct dma_issue; +#ifdef CONFIG_TMIO_MMC_DMA + struct dma_async_tx_descriptor *desc; + unsigned int dma_sglen; + dma_cookie_t cookie; +#endif +}; + +#include + +static inline u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) +{ + return readw(host->ctl + (addr << host->bus_shift)); +} + +static inline void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, + u16 *buf, int count) +{ + readsw(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) +{ + return readw(host->ctl + (addr << host->bus_shift)) | + readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; +} + +static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, + u16 val) +{ + writew(val, host->ctl + (addr << host->bus_shift)); +} + +static inline void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, + u16 *buf, int count) +{ + writesw(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, + u32 val) +{ + writew(val, host->ctl + (addr << 
host->bus_shift)); + writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); +} + +#include +#include + +static inline void tmio_mmc_init_sg(struct tmio_mmc_host *host, + struct mmc_data *data) +{ + host->sg_len = data->sg_len; + host->sg_ptr = data->sg; + host->sg_off = 0; +} + +static inline int tmio_mmc_next_sg(struct tmio_mmc_host *host) +{ + host->sg_ptr = sg_next(host->sg_ptr); + host->sg_off = 0; + return --host->sg_len; +} + +static inline char *tmio_mmc_kmap_atomic(struct scatterlist *sg, + unsigned long *flags) +{ + local_irq_save(*flags); + return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset; +} + +static inline void tmio_mmc_kunmap_atomic(void *virt, + unsigned long *flags) +{ + kunmap_atomic(virt, KM_BIO_SRC_IRQ); + local_irq_restore(*flags); +} + +#ifdef CONFIG_MMC_DEBUG + +#define STATUS_TO_TEXT(a) \ + do { \ + if (status & TMIO_STAT_##a) \ + printk(#a); \ + } while (0) + +void pr_debug_status(u32 status) +{ + printk(KERN_DEBUG "status: %08x = ", status); + STATUS_TO_TEXT(CARD_REMOVE); + STATUS_TO_TEXT(CARD_INSERT); + STATUS_TO_TEXT(SIGSTATE); + STATUS_TO_TEXT(WRPROTECT); + STATUS_TO_TEXT(CARD_REMOVE_A); + STATUS_TO_TEXT(CARD_INSERT_A); + STATUS_TO_TEXT(SIGSTATE_A); + STATUS_TO_TEXT(CMD_IDX_ERR); + STATUS_TO_TEXT(STOPBIT_ERR); + STATUS_TO_TEXT(ILL_FUNC); + STATUS_TO_TEXT(CMD_BUSY); + STATUS_TO_TEXT(CMDRESPEND); + STATUS_TO_TEXT(DATAEND); + STATUS_TO_TEXT(CRCFAIL); + STATUS_TO_TEXT(DATATIMEOUT); + STATUS_TO_TEXT(CMDTIMEOUT); + STATUS_TO_TEXT(RXOVERFLOW); + STATUS_TO_TEXT(TXUNDERRUN); + STATUS_TO_TEXT(RXRDY); + STATUS_TO_TEXT(TXRQ); + STATUS_TO_TEXT(ILL_ACCESS); + printk("\n"); +} + +#else +#define pr_debug_status(s) do { } while (0) +#endif diff --git a/trunk/drivers/rtc/class.c b/trunk/drivers/rtc/class.c index 9583cbcc6b79..e6539cbabb35 100644 --- a/trunk/drivers/rtc/class.c +++ b/trunk/drivers/rtc/class.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "rtc-core.h" @@ -153,18 +152,6 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, spin_lock_init(&rtc->irq_task_lock); init_waitqueue_head(&rtc->irq_queue); - /* Init timerqueue */ - timerqueue_init_head(&rtc->timerqueue); - INIT_WORK(&rtc->irqwork, rtc_timer_do_work); - /* Init aie timer */ - rtc_timer_init(&rtc->aie_timer, rtc_aie_update_irq, (void *)rtc); - /* Init uie timer */ - rtc_timer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, (void *)rtc); - /* Init pie timer */ - hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rtc->pie_timer.function = rtc_pie_update_irq; - rtc->pie_enabled = 0; - strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); dev_set_name(&rtc->dev, "rtc%d", id); diff --git a/trunk/drivers/rtc/interface.c b/trunk/drivers/rtc/interface.c index 90384b9f6b2c..a0c816238aa9 100644 --- a/trunk/drivers/rtc/interface.c +++ b/trunk/drivers/rtc/interface.c @@ -14,11 +14,15 @@ #include #include #include -#include -static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) +int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; + + err = mutex_lock_interruptible(&rtc->ops_lock); + if (err) + return err; + if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->read_time) @@ -27,18 +31,7 @@ static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) memset(tm, 0, sizeof(struct rtc_time)); err = rtc->ops->read_time(rtc->dev.parent, tm); } - return err; -} - -int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) -{ - int err; - err = mutex_lock_interruptible(&rtc->ops_lock); - if (err) - return err; 
- - err = __rtc_read_time(rtc, tm); mutex_unlock(&rtc->ops_lock); return err; } @@ -113,54 +106,188 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) } EXPORT_SYMBOL_GPL(rtc_set_mmss); -int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - alarm->enabled = rtc->aie_timer.enabled; - if (alarm->enabled) - alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires); - mutex_unlock(&rtc->ops_lock); - return 0; + if (rtc->ops == NULL) + err = -ENODEV; + else if (!rtc->ops->read_alarm) + err = -EINVAL; + else { + memset(alarm, 0, sizeof(struct rtc_wkalrm)); + err = rtc->ops->read_alarm(rtc->dev.parent, alarm); + } + + mutex_unlock(&rtc->ops_lock); + return err; } -EXPORT_SYMBOL_GPL(rtc_read_alarm); -int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { - struct rtc_time tm; - long now, scheduled; int err; + struct rtc_time before, now; + int first_time = 1; + unsigned long t_now, t_alm; + enum { none, day, month, year } missing = none; + unsigned days; + + /* The lower level RTC driver may return -1 in some fields, + * creating invalid alarm->time values, for reasons like: + * + * - The hardware may not be capable of filling them in; + * many alarms match only on time-of-day fields, not + * day/month/year calendar data. + * + * - Some hardware uses illegal values as "wildcard" match + * values, which non-Linux firmware (like a BIOS) may try + * to set up as e.g. "alarm 15 minutes after each hour". + * Linux uses only oneshot alarms. + * + * When we see that here, we deal with it by using values from + * a current RTC timestamp for any missing (-1) values. The + * RTC driver prevents "periodic alarm" modes. + * + * But this can be racey, because some fields of the RTC timestamp + * may have wrapped in the interval since we read the RTC alarm, + * which would lead to us inserting inconsistent values in place + * of the -1 fields. + * + * Reading the alarm and timestamp in the reverse sequence + * would have the same race condition, and not solve the issue. + * + * So, we must first read the RTC timestamp, + * then read the RTC alarm value, + * and then read a second RTC timestamp. + * + * If any fields of the second timestamp have changed + * when compared with the first timestamp, then we know + * our timestamp may be inconsistent with that used by + * the low-level rtc_read_alarm_internal() function. + * + * So, when the two timestamps disagree, we just loop and do + * the process again to get a fully consistent set of values. + * + * This could all instead be done in the lower level driver, + * but since more than one lower level RTC implementation needs it, + * then it's probably best best to do it here instead of there.. + */ - err = rtc_valid_tm(&alarm->time); - if (err) + /* Get the "before" timestamp */ + err = rtc_read_time(rtc, &before); + if (err < 0) return err; - rtc_tm_to_time(&alarm->time, &scheduled); - - /* Make sure we're not setting alarms in the past */ - err = __rtc_read_time(rtc, &tm); - rtc_tm_to_time(&tm, &now); - if (scheduled <= now) - return -ETIME; - /* - * XXX - We just checked to make sure the alarm time is not - * in the past, but there is still a race window where if - * the is alarm set for the next second and the second ticks - * over right here, before we set the alarm. 
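The block comment above explains the approach the new rtc_read_alarm() takes: read the RTC before and after reading the alarm so wrapped fields can be detected, fill any -1 ("don't care") alarm fields from the current time, then roll the result forward until it lies in the future; the do/while loop and switch added just below implement it. As a standalone worked example of the simplest case, the 24-hour ("day") rollover, using <time.h> in place of the RTC helpers:

#include <stdio.h>
#include <time.h>

int main(void)
{
    /* "now": 10:00:00 on 15 Nov 2010 (months 0-based, years since 1900). */
    struct tm now = { .tm_year = 110, .tm_mon = 10, .tm_mday = 15,
                      .tm_hour = 10, .tm_min = 0, .tm_sec = 0 };
    /* Alarm hardware only reported a time of day: 05:00; day/month/year were -1
     * and have been filled in from "now", as the added code does. */
    struct tm alarm = now;
    alarm.tm_hour = 5;
    alarm.tm_min = 0;
    alarm.tm_sec = 0;

    time_t t_now = mktime(&now);
    time_t t_alm = mktime(&alarm);

    if (t_alm <= t_now) {           /* 05:00 today has already passed ... */
        t_alm += 24 * 60 * 60;      /* ... so the alarm means 05:00 tomorrow */
        alarm = *localtime(&t_alm);
    }
    printf("alarm fires %04d-%02d-%02d %02d:%02d\n",
           alarm.tm_year + 1900, alarm.tm_mon + 1, alarm.tm_mday,
           alarm.tm_hour, alarm.tm_min);
    return 0;
}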
+ do { + if (!first_time) + memcpy(&before, &now, sizeof(struct rtc_time)); + first_time = 0; + + /* get the RTC alarm values, which may be incomplete */ + err = rtc_read_alarm_internal(rtc, alarm); + if (err) + return err; + if (!alarm->enabled) + return 0; + + /* full-function RTCs won't have such missing fields */ + if (rtc_valid_tm(&alarm->time) == 0) + return 0; + + /* get the "after" timestamp, to detect wrapped fields */ + err = rtc_read_time(rtc, &now); + if (err < 0) + return err; + + /* note that tm_sec is a "don't care" value here: */ + } while ( before.tm_min != now.tm_min + || before.tm_hour != now.tm_hour + || before.tm_mon != now.tm_mon + || before.tm_year != now.tm_year); + + /* Fill in the missing alarm fields using the timestamp; we + * know there's at least one since alarm->time is invalid. */ + if (alarm->time.tm_sec == -1) + alarm->time.tm_sec = now.tm_sec; + if (alarm->time.tm_min == -1) + alarm->time.tm_min = now.tm_min; + if (alarm->time.tm_hour == -1) + alarm->time.tm_hour = now.tm_hour; + + /* For simplicity, only support date rollover for now */ + if (alarm->time.tm_mday == -1) { + alarm->time.tm_mday = now.tm_mday; + missing = day; + } + if (alarm->time.tm_mon == -1) { + alarm->time.tm_mon = now.tm_mon; + if (missing == none) + missing = month; + } + if (alarm->time.tm_year == -1) { + alarm->time.tm_year = now.tm_year; + if (missing == none) + missing = year; + } - if (!rtc->ops) - err = -ENODEV; - else if (!rtc->ops->set_alarm) - err = -EINVAL; - else - err = rtc->ops->set_alarm(rtc->dev.parent, alarm); + /* with luck, no rollover is needed */ + rtc_tm_to_time(&now, &t_now); + rtc_tm_to_time(&alarm->time, &t_alm); + if (t_now < t_alm) + goto done; - return err; + switch (missing) { + + /* 24 hour rollover ... if it's now 10am Monday, an alarm that + * that will trigger at 5am will do so at 5am Tuesday, which + * could also be in the next month or year. This is a common + * case, especially for PCs. + */ + case day: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); + t_alm += 24 * 60 * 60; + rtc_time_to_tm(t_alm, &alarm->time); + break; + + /* Month rollover ... if it's the 31th, an alarm on the 3rd will + * be next month. An alarm matching on the 30th, 29th, or 28th + * may end up in the month after that! Many newer PCs support + * this type of alarm. + */ + case month: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); + do { + if (alarm->time.tm_mon < 11) + alarm->time.tm_mon++; + else { + alarm->time.tm_mon = 0; + alarm->time.tm_year++; + } + days = rtc_month_days(alarm->time.tm_mon, + alarm->time.tm_year); + } while (days < alarm->time.tm_mday); + break; + + /* Year rollover ... easy except for leap years! 
*/ + case year: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); + do { + alarm->time.tm_year++; + } while (rtc_valid_tm(&alarm->time) != 0); + break; + + default: + dev_warn(&rtc->dev, "alarm rollover not handled\n"); + } + +done: + return 0; } +EXPORT_SYMBOL_GPL(rtc_read_alarm); int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { @@ -173,18 +300,16 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - if (rtc->aie_timer.enabled) { - rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; - } - rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); - rtc->aie_timer.period = ktime_set(0, 0); - if (alarm->enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); - } + + if (!rtc->ops) + err = -ENODEV; + else if (!rtc->ops->set_alarm) + err = -EINVAL; + else + err = rtc->ops->set_alarm(rtc->dev.parent, alarm); + mutex_unlock(&rtc->ops_lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(rtc_set_alarm); @@ -194,16 +319,6 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (err) return err; - if (rtc->aie_timer.enabled != enabled) { - if (enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); - } else { - rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; - } - } - if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->alarm_irq_enable) @@ -222,53 +337,52 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (err) return err; - /* make sure we're changing state */ - if (rtc->uie_rtctimer.enabled == enabled) - goto out; - - if (enabled) { - struct rtc_time tm; - ktime_t now, onesec; - - __rtc_read_time(rtc, &tm); - onesec = ktime_set(1, 0); - now = rtc_tm_to_ktime(tm); - rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); - rtc->uie_rtctimer.period = ktime_set(1, 0); - rtc->uie_rtctimer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); - } else { - rtc_timer_remove(rtc, &rtc->uie_rtctimer); - rtc->uie_rtctimer.enabled = 0; +#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL + if (enabled == 0 && rtc->uie_irq_active) { + mutex_unlock(&rtc->ops_lock); + return rtc_dev_update_irq_enable_emul(rtc, enabled); } +#endif + + if (!rtc->ops) + err = -ENODEV; + else if (!rtc->ops->update_irq_enable) + err = -EINVAL; + else + err = rtc->ops->update_irq_enable(rtc->dev.parent, enabled); -out: mutex_unlock(&rtc->ops_lock); - return err; +#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL + /* + * Enable emulation if the driver did not provide + * the update_irq_enable function pointer or if returned + * -EINVAL to signal that it has been configured without + * interrupts or that are not available at the moment. + */ + if (err == -EINVAL) + err = rtc_dev_update_irq_enable_emul(rtc, enabled); +#endif + return err; } EXPORT_SYMBOL_GPL(rtc_update_irq_enable); - /** - * rtc_handle_legacy_irq - AIE, UIE and PIE event hook - * @rtc: pointer to the rtc device - * - * This function is called when an AIE, UIE or PIE mode interrupt - * has occured (or been emulated). - * - * Triggers the registered irq_task function callback. 
+ * rtc_update_irq - report RTC periodic, alarm, and/or update irqs + * @rtc: the rtc device + * @num: how many irqs are being reported (usually one) + * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF + * Context: any */ -static void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode) +void rtc_update_irq(struct rtc_device *rtc, + unsigned long num, unsigned long events) { unsigned long flags; - /* mark one irq of the appropriate mode */ spin_lock_irqsave(&rtc->irq_lock, flags); - rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF|mode); + rtc->irq_data = (rtc->irq_data + (num << 8)) | events; spin_unlock_irqrestore(&rtc->irq_lock, flags); - /* call the task func */ spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task) rtc->irq_task->func(rtc->irq_task->private_data); @@ -277,69 +391,6 @@ static void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode) wake_up_interruptible(&rtc->irq_queue); kill_fasync(&rtc->async_queue, SIGIO, POLL_IN); } - - -/** - * rtc_aie_update_irq - AIE mode rtctimer hook - * @private: pointer to the rtc_device - * - * This functions is called when the aie_timer expires. - */ -void rtc_aie_update_irq(void *private) -{ - struct rtc_device *rtc = (struct rtc_device *)private; - rtc_handle_legacy_irq(rtc, 1, RTC_AF); -} - - -/** - * rtc_uie_update_irq - UIE mode rtctimer hook - * @private: pointer to the rtc_device - * - * This functions is called when the uie_timer expires. - */ -void rtc_uie_update_irq(void *private) -{ - struct rtc_device *rtc = (struct rtc_device *)private; - rtc_handle_legacy_irq(rtc, 1, RTC_UF); -} - - -/** - * rtc_pie_update_irq - PIE mode hrtimer hook - * @timer: pointer to the pie mode hrtimer - * - * This function is used to emulate PIE mode interrupts - * using an hrtimer. This function is called when the periodic - * hrtimer expires. - */ -enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer) -{ - struct rtc_device *rtc; - ktime_t period; - int count; - rtc = container_of(timer, struct rtc_device, pie_timer); - - period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); - count = hrtimer_forward_now(timer, period); - - rtc_handle_legacy_irq(rtc, count, RTC_PF); - - return HRTIMER_RESTART; -} - -/** - * rtc_update_irq - Triggered when a RTC interrupt occurs. 
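rtc_update_irq(), restored just above, reports events by folding them into rtc->irq_data: the low byte accumulates RTC_IRQF plus the RTC_PF/RTC_AF/RTC_UF flags, and the number of reported interrupts is added into the bits above it. A tiny self-contained illustration of that encoding (the flag values mirror the classic rtc.h definitions, which are not part of this hunk):

#include <stdio.h>

#define RTC_IRQF 0x80   /* any of the following is active */
#define RTC_PF   0x40   /* periodic */
#define RTC_AF   0x20   /* alarm */
#define RTC_UF   0x10   /* update */

int main(void)
{
    unsigned long irq_data = 0;
    unsigned long num = 1, events = RTC_IRQF | RTC_AF;

    /* Same update rtc_update_irq() performs under irq_lock. */
    irq_data = (irq_data + (num << 8)) | events;

    printf("count=%lu flags=0x%02lx\n", irq_data >> 8, irq_data & 0xff);
    return 0;
}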
- * @rtc: the rtc device - * @num: how many irqs are being reported (usually one) - * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF - * Context: any - */ -void rtc_update_irq(struct rtc_device *rtc, - unsigned long num, unsigned long events) -{ - schedule_work(&rtc->irqwork); -} EXPORT_SYMBOL_GPL(rtc_update_irq); static int __rtc_match(struct device *dev, void *data) @@ -426,21 +477,19 @@ int rtc_irq_set_state(struct rtc_device *rtc, struct rtc_task *task, int enabled int err = 0; unsigned long flags; + if (rtc->ops->irq_set_state == NULL) + return -ENXIO; + spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; - - if (enabled) { - ktime_t period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); - hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); - } else { - hrtimer_cancel(&rtc->pie_timer); - } - rtc->pie_enabled = enabled; spin_unlock_irqrestore(&rtc->irq_task_lock, flags); + if (err == 0) + err = rtc->ops->irq_set_state(rtc->dev.parent, enabled); + return err; } EXPORT_SYMBOL_GPL(rtc_irq_set_state); @@ -460,194 +509,21 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq) int err = 0; unsigned long flags; + if (rtc->ops->irq_set_freq == NULL) + return -ENXIO; + spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; + spin_unlock_irqrestore(&rtc->irq_task_lock, flags); + if (err == 0) { - rtc->irq_freq = freq; - if (rtc->pie_enabled) { - ktime_t period; - hrtimer_cancel(&rtc->pie_timer); - period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); - hrtimer_start(&rtc->pie_timer, period, - HRTIMER_MODE_REL); - } + err = rtc->ops->irq_set_freq(rtc->dev.parent, freq); + if (err == 0) + rtc->irq_freq = freq; } - spin_unlock_irqrestore(&rtc->irq_task_lock, flags); return err; } EXPORT_SYMBOL_GPL(rtc_irq_set_freq); - -/** - * rtc_timer_enqueue - Adds a rtc_timer to the rtc_device timerqueue - * @rtc rtc device - * @timer timer being added. - * - * Enqueues a timer onto the rtc devices timerqueue and sets - * the next alarm event appropriately. - * - * Must hold ops_lock for proper serialization of timerqueue - */ -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) -{ - timerqueue_add(&rtc->timerqueue, &timer->node); - if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { - struct rtc_wkalrm alarm; - int err; - alarm.time = rtc_ktime_to_tm(timer->node.expires); - alarm.enabled = 1; - err = __rtc_set_alarm(rtc, &alarm); - if (err == -ETIME) - schedule_work(&rtc->irqwork); - } -} - -/** - * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue - * @rtc rtc device - * @timer timer being removed. - * - * Removes a timer onto the rtc devices timerqueue and sets - * the next alarm event appropriately. 
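Another aside for context, not part of the patch: with the rtc_update_irq() variant restored a little earlier in this interface.c hunk, a hardware driver reports events straight from its interrupt handler, packing the event count into the high bits of irq_data and the RTC_IRQF/RTC_AF/RTC_UF/RTC_PF flags into the low byte. A hypothetical driver snippet (the foo_* names are invented):

#include <linux/interrupt.h>
#include <linux/rtc.h>

static irqreturn_t foo_rtc_irq(int irq, void *dev_id)
{
	struct rtc_device *rtc = dev_id;

	/* one interrupt, alarm event: count in the high bits, flags below */
	rtc_update_irq(rtc, 1, RTC_IRQF | RTC_AF);

	return IRQ_HANDLED;
}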
- * - * Must hold ops_lock for proper serialization of timerqueue - */ -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) -{ - struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); - timerqueue_del(&rtc->timerqueue, &timer->node); - - if (next == &timer->node) { - struct rtc_wkalrm alarm; - int err; - next = timerqueue_getnext(&rtc->timerqueue); - if (!next) - return; - alarm.time = rtc_ktime_to_tm(next->expires); - alarm.enabled = 1; - err = __rtc_set_alarm(rtc, &alarm); - if (err == -ETIME) - schedule_work(&rtc->irqwork); - } -} - -/** - * rtc_timer_do_work - Expires rtc timers - * @rtc rtc device - * @timer timer being removed. - * - * Expires rtc timers. Reprograms next alarm event if needed. - * Called via worktask. - * - * Serializes access to timerqueue via ops_lock mutex - */ -void rtc_timer_do_work(struct work_struct *work) -{ - struct rtc_timer *timer; - struct timerqueue_node *next; - ktime_t now; - struct rtc_time tm; - - struct rtc_device *rtc = - container_of(work, struct rtc_device, irqwork); - - mutex_lock(&rtc->ops_lock); -again: - __rtc_read_time(rtc, &tm); - now = rtc_tm_to_ktime(tm); - while ((next = timerqueue_getnext(&rtc->timerqueue))) { - if (next->expires.tv64 > now.tv64) - break; - - /* expire timer */ - timer = container_of(next, struct rtc_timer, node); - timerqueue_del(&rtc->timerqueue, &timer->node); - timer->enabled = 0; - if (timer->task.func) - timer->task.func(timer->task.private_data); - - /* Re-add/fwd periodic timers */ - if (ktime_to_ns(timer->period)) { - timer->node.expires = ktime_add(timer->node.expires, - timer->period); - timer->enabled = 1; - timerqueue_add(&rtc->timerqueue, &timer->node); - } - } - - /* Set next alarm */ - if (next) { - struct rtc_wkalrm alarm; - int err; - alarm.time = rtc_ktime_to_tm(next->expires); - alarm.enabled = 1; - err = __rtc_set_alarm(rtc, &alarm); - if (err == -ETIME) - goto again; - } - - mutex_unlock(&rtc->ops_lock); -} - - -/* rtc_timer_init - Initializes an rtc_timer - * @timer: timer to be intiialized - * @f: function pointer to be called when timer fires - * @data: private data passed to function pointer - * - * Kernel interface to initializing an rtc_timer. 
- */ -void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) -{ - timerqueue_init(&timer->node); - timer->enabled = 0; - timer->task.func = f; - timer->task.private_data = data; -} - -/* rtc_timer_start - Sets an rtc_timer to fire in the future - * @ rtc: rtc device to be used - * @ timer: timer being set - * @ expires: time at which to expire the timer - * @ period: period that the timer will recur - * - * Kernel interface to set an rtc_timer - */ -int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, - ktime_t expires, ktime_t period) -{ - int ret = 0; - mutex_lock(&rtc->ops_lock); - if (timer->enabled) - rtc_timer_remove(rtc, timer); - - timer->node.expires = expires; - timer->period = period; - - timer->enabled = 1; - rtc_timer_enqueue(rtc, timer); - - mutex_unlock(&rtc->ops_lock); - return ret; -} - -/* rtc_timer_cancel - Stops an rtc_timer - * @ rtc: rtc device to be used - * @ timer: timer being set - * - * Kernel interface to cancel an rtc_timer - */ -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) -{ - int ret = 0; - mutex_lock(&rtc->ops_lock); - if (timer->enabled) - rtc_timer_remove(rtc, timer); - timer->enabled = 0; - mutex_unlock(&rtc->ops_lock); - return ret; -} - - diff --git a/trunk/drivers/rtc/rtc-dev.c b/trunk/drivers/rtc/rtc-dev.c index 212b16edafc0..0cc0984d155b 100644 --- a/trunk/drivers/rtc/rtc-dev.c +++ b/trunk/drivers/rtc/rtc-dev.c @@ -46,6 +46,105 @@ static int rtc_dev_open(struct inode *inode, struct file *file) return err; } +#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL +/* + * Routine to poll RTC seconds field for change as often as possible, + * after first RTC_UIE use timer to reduce polling + */ +static void rtc_uie_task(struct work_struct *work) +{ + struct rtc_device *rtc = + container_of(work, struct rtc_device, uie_task); + struct rtc_time tm; + int num = 0; + int err; + + err = rtc_read_time(rtc, &tm); + + spin_lock_irq(&rtc->irq_lock); + if (rtc->stop_uie_polling || err) { + rtc->uie_task_active = 0; + } else if (rtc->oldsecs != tm.tm_sec) { + num = (tm.tm_sec + 60 - rtc->oldsecs) % 60; + rtc->oldsecs = tm.tm_sec; + rtc->uie_timer.expires = jiffies + HZ - (HZ/10); + rtc->uie_timer_active = 1; + rtc->uie_task_active = 0; + add_timer(&rtc->uie_timer); + } else if (schedule_work(&rtc->uie_task) == 0) { + rtc->uie_task_active = 0; + } + spin_unlock_irq(&rtc->irq_lock); + if (num) + rtc_update_irq(rtc, num, RTC_UF | RTC_IRQF); +} +static void rtc_uie_timer(unsigned long data) +{ + struct rtc_device *rtc = (struct rtc_device *)data; + unsigned long flags; + + spin_lock_irqsave(&rtc->irq_lock, flags); + rtc->uie_timer_active = 0; + rtc->uie_task_active = 1; + if ((schedule_work(&rtc->uie_task) == 0)) + rtc->uie_task_active = 0; + spin_unlock_irqrestore(&rtc->irq_lock, flags); +} + +static int clear_uie(struct rtc_device *rtc) +{ + spin_lock_irq(&rtc->irq_lock); + if (rtc->uie_irq_active) { + rtc->stop_uie_polling = 1; + if (rtc->uie_timer_active) { + spin_unlock_irq(&rtc->irq_lock); + del_timer_sync(&rtc->uie_timer); + spin_lock_irq(&rtc->irq_lock); + rtc->uie_timer_active = 0; + } + if (rtc->uie_task_active) { + spin_unlock_irq(&rtc->irq_lock); + flush_work_sync(&rtc->uie_task); + spin_lock_irq(&rtc->irq_lock); + } + rtc->uie_irq_active = 0; + } + spin_unlock_irq(&rtc->irq_lock); + return 0; +} + +static int set_uie(struct rtc_device *rtc) +{ + struct rtc_time tm; + int err; + + err = rtc_read_time(rtc, &tm); + if (err) + return err; + spin_lock_irq(&rtc->irq_lock); + if (!rtc->uie_irq_active) { + 
rtc->uie_irq_active = 1; + rtc->stop_uie_polling = 0; + rtc->oldsecs = tm.tm_sec; + rtc->uie_task_active = 1; + if (schedule_work(&rtc->uie_task) == 0) + rtc->uie_task_active = 0; + } + rtc->irq_data = 0; + spin_unlock_irq(&rtc->irq_lock); + return 0; +} + +int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled) +{ + if (enabled) + return set_uie(rtc); + else + return clear_uie(rtc); +} +EXPORT_SYMBOL(rtc_dev_update_irq_enable_emul); + +#endif /* CONFIG_RTC_INTF_DEV_UIE_EMUL */ static ssize_t rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -394,6 +493,11 @@ void rtc_dev_prepare(struct rtc_device *rtc) rtc->dev.devt = MKDEV(MAJOR(rtc_devt), rtc->id); +#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL + INIT_WORK(&rtc->uie_task, rtc_uie_task); + setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc); +#endif + cdev_init(&rtc->char_dev, &rtc_dev_fops); rtc->char_dev.owner = rtc->owner; } diff --git a/trunk/drivers/rtc/rtc-lib.c b/trunk/drivers/rtc/rtc-lib.c index 075f1708deae..773851f338b8 100644 --- a/trunk/drivers/rtc/rtc-lib.c +++ b/trunk/drivers/rtc/rtc-lib.c @@ -117,32 +117,4 @@ int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time) } EXPORT_SYMBOL(rtc_tm_to_time); -/* - * Convert rtc_time to ktime - */ -ktime_t rtc_tm_to_ktime(struct rtc_time tm) -{ - time_t time; - rtc_tm_to_time(&tm, &time); - return ktime_set(time, 0); -} -EXPORT_SYMBOL_GPL(rtc_tm_to_ktime); - -/* - * Convert ktime to rtc_time - */ -struct rtc_time rtc_ktime_to_tm(ktime_t kt) -{ - struct timespec ts; - struct rtc_time ret; - - ts = ktime_to_timespec(kt); - /* Round up any ns */ - if (ts.tv_nsec) - ts.tv_sec++; - rtc_time_to_tm(ts.tv_sec, &ret); - return ret; -} -EXPORT_SYMBOL_GPL(rtc_ktime_to_tm); - MODULE_LICENSE("GPL"); diff --git a/trunk/drivers/watchdog/hpwdt.c b/trunk/drivers/watchdog/hpwdt.c index 24b966d5061a..dea7b5bf6e2c 100644 --- a/trunk/drivers/watchdog/hpwdt.c +++ b/trunk/drivers/watchdog/hpwdt.c @@ -469,7 +469,7 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason, unsigned long rom_pl; static int die_nmi_called; - if (ulReason != DIE_NMIUNKNOWN) + if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) goto out; if (!hpwdt_nmi_decoding) diff --git a/trunk/fs/9p/acl.c b/trunk/fs/9p/acl.c index 6e58c4ca1e6e..c9da2640f6f1 100644 --- a/trunk/fs/9p/acl.c +++ b/trunk/fs/9p/acl.c @@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { ssize_t size; void *value = NULL; - struct posix_acl *acl = NULL;; + struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); if (size > 0) { diff --git a/trunk/fs/9p/xattr.c b/trunk/fs/9p/xattr.c index 43ec7df84336..d288773871b3 100644 --- a/trunk/fs/9p/xattr.c +++ b/trunk/fs/9p/xattr.c @@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, "p9_client_xattrcreate failed %d\n", retval); goto error; } - msize = fid->clnt->msize;; + msize = fid->clnt->msize; while (value_len) { if (value_len > (msize - P9_IOHDRSZ)) write_count = msize - P9_IOHDRSZ; diff --git a/trunk/fs/ocfs2/Kconfig b/trunk/fs/ocfs2/Kconfig index ab152c00cd3a..0d840669698e 100644 --- a/trunk/fs/ocfs2/Kconfig +++ b/trunk/fs/ocfs2/Kconfig @@ -51,7 +51,7 @@ config OCFS2_FS_USERSPACE_CLUSTER config OCFS2_FS_STATS bool "OCFS2 statistics" - depends on OCFS2_FS && DEBUG_FS + depends on OCFS2_FS default y help This option allows some fs statistics to be captured. 
Enabling diff --git a/trunk/fs/ocfs2/alloc.c b/trunk/fs/ocfs2/alloc.c index e4984e259cb6..592fae5007d1 100644 --- a/trunk/fs/ocfs2/alloc.c +++ b/trunk/fs/ocfs2/alloc.c @@ -565,6 +565,7 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) return ret; } +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); static void ocfs2_adjust_rightmost_records(handle_t *handle, @@ -5857,7 +5858,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - osb->truncated_clusters += num_clusters; bail: mlog_exit(status); return status; @@ -5929,8 +5929,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, i--; } - osb->truncated_clusters = 0; - bail: mlog_exit(status); return status; @@ -7140,6 +7138,64 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, return status; } +/* + * Expects the inode to already be locked. + */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc) +{ + int status; + unsigned int new_i_clusters; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct buffer_head *last_eb_bh = NULL; + + mlog_entry_void(); + + *tc = NULL; + + new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" + "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, + (unsigned long long)le64_to_cpu(fe->i_size)); + + *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + if (!(*tc)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); + + if (fe->id2.i_list.l_tree_depth) { + status = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + } + + (*tc)->tc_last_eb_bh = last_eb_bh; + + status = 0; +bail: + if (status < 0) { + if (*tc) + ocfs2_free_truncate_context(*tc); + *tc = NULL; + } + mlog_exit_void(); + return status; +} + /* * 'start' is inclusive, 'end' is not. */ @@ -7214,3 +7270,18 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, out: return ret; } + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +{ + /* + * The caller is responsible for completing deallocation + * before freeing the context. 
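Editor's aside, not part of the patch: ocfs2_prepare_truncate() above derives new_i_clusters from the inode size with ocfs2_clusters_for_bytes(), which is essentially a round-up division by the cluster size. A standalone sketch of that arithmetic (example_clusters_for_bytes() and the cluster_bits parameter are illustrative, not the ocfs2 API):

#include <linux/types.h>

static inline u32 example_clusters_for_bytes(u64 bytes, unsigned int cluster_bits)
{
	/* round up to the next whole cluster, then convert bytes to a count */
	return (u32)((bytes + (1ULL << cluster_bits) - 1) >> cluster_bits);
}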
+ */ + if (tc->tc_dealloc.c_first_suballocator != NULL) + mlog(ML_NOTICE, + "Truncate completion has non-empty dealloc context\n"); + + brelse(tc->tc_last_eb_bh); + + kfree(tc); +} diff --git a/trunk/fs/ocfs2/alloc.h b/trunk/fs/ocfs2/alloc.h index 3bd08a03251c..55762b554b99 100644 --- a/trunk/fs/ocfs2/alloc.h +++ b/trunk/fs/ocfs2/alloc.h @@ -228,6 +228,10 @@ struct ocfs2_truncate_context { int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end); +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc); int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh); diff --git a/trunk/fs/ocfs2/aops.c b/trunk/fs/ocfs2/aops.c index 1fbb0e20131b..0d7c5540ad66 100644 --- a/trunk/fs/ocfs2/aops.c +++ b/trunk/fs/ocfs2/aops.c @@ -1630,43 +1630,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -/* - * Try to flush truncate logs if we can free enough clusters from it. - * As for return value, "< 0" means error, "0" no space and "1" means - * we have freed enough spaces and let the caller try to allocate again. - */ -static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, - unsigned int needed) -{ - tid_t target; - int ret = 0; - unsigned int truncated_clusters; - - mutex_lock(&osb->osb_tl_inode->i_mutex); - truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); - - /* - * Check whether we can succeed in allocating if we free - * the truncate log. - */ - if (truncated_clusters < needed) - goto out; - - ret = ocfs2_flush_truncate_log(osb); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { - jbd2_log_wait_commit(osb->journal->j_journal, target); - ret = 1; - } -out: - return ret; -} - int ocfs2_write_begin_nolock(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1674,7 +1637,7 @@ int ocfs2_write_begin_nolock(struct file *filp, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; + unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1683,9 +1646,7 @@ int ocfs2_write_begin_nolock(struct file *filp, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; struct ocfs2_extent_tree et; - int try_free = 1, ret1; -try_again: ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); @@ -1720,7 +1681,6 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } else if (ret == 1) { - clusters_need = wc->w_clen; ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { @@ -1735,7 +1695,6 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } - clusters_need += clusters_to_alloc; di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1858,22 +1817,6 @@ int ocfs2_write_begin_nolock(struct file *filp, ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); - - if (ret == -ENOSPC && try_free) { - /* - * Try to free some truncate log so that we can have enough - * clusters to allocate. 
- */ - try_free = 0; - - ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); - if (ret1 == 1) - goto try_again; - - if (ret1 < 0) - mlog_errno(ret1); - } - return ret; } diff --git a/trunk/fs/ocfs2/cluster/heartbeat.c b/trunk/fs/ocfs2/cluster/heartbeat.c index a6cc05302e9f..9e3d45bcb5fd 100644 --- a/trunk/fs/ocfs2/cluster/heartbeat.c +++ b/trunk/fs/ocfs2/cluster/heartbeat.c @@ -82,7 +82,6 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_REGION_LIVENODES 4 #define O2HB_DB_TYPE_REGION_NUMBER 5 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 -#define O2HB_DB_TYPE_REGION_PINNED 7 struct o2hb_debug_buf { int db_type; int db_size; @@ -102,7 +101,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" #define O2HB_DEBUG_REGION_NUMBER "num" #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" -#define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -134,33 +132,6 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; -/* - * o2hb_dependent_users tracks the number of registered callbacks that depend - * on heartbeat. o2net and o2dlm are two entities that register this callback. - * However only o2dlm depends on the heartbeat. It does not want the heartbeat - * to stop while a dlm domain is still active. - */ -unsigned int o2hb_dependent_users; - -/* - * In global heartbeat mode, all regions are pinned if there are one or more - * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All - * regions are unpinned if the region count exceeds the cut off or the number - * of dependent users falls to zero. - */ -#define O2HB_PIN_CUT_OFF 3 - -/* - * In local heartbeat mode, we assume the dlm domain name to be the same as - * region uuid. This is true for domains created for the file system but not - * necessarily true for userdlm domains. This is a known limitation. - * - * In global heartbeat mode, we pin/unpin all o2hb regions. This solution - * works for both file system and userdlm domains. - */ -static int o2hb_region_pin(const char *region_uuid); -static void o2hb_region_unpin(const char *region_uuid); - /* Only sets a new threshold if there are no active regions. * * No locking or otherwise interesting code is required for reading @@ -215,9 +186,7 @@ struct o2hb_region { struct config_item hr_item; struct list_head hr_all_item; - unsigned hr_unclean_stop:1, - hr_item_pinned:1, - hr_item_dropped:1; + unsigned hr_unclean_stop:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -243,11 +212,9 @@ struct o2hb_region { struct dentry *hr_debug_livenodes; struct dentry *hr_debug_regnum; struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; - struct o2hb_debug_buf *hr_db_pinned; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. 
This will be fixed when we have @@ -734,14 +701,6 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, config_item_name(®->hr_item)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); - - /* - * If global heartbeat active, unpin all regions if the - * region count > CUT_OFF - */ - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, - O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) - o2hb_region_unpin(NULL); } static int o2hb_check_slot(struct o2hb_region *reg, @@ -1082,9 +1041,6 @@ static int o2hb_thread(void *data) set_user_nice(current, -20); - /* Pin node */ - o2nm_depend_this_node(); - while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than @@ -1134,9 +1090,6 @@ static int o2hb_thread(void *data) mlog_errno(ret); } - /* Unpin node */ - o2nm_undepend_this_node(); - mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); return 0; @@ -1189,12 +1142,6 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) reg->hr_last_timeout_start)); goto done; - case O2HB_DB_TYPE_REGION_PINNED: - reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", - !!reg->hr_item_pinned); - goto done; - default: goto done; } @@ -1368,8 +1315,6 @@ int o2hb_init(void) memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); - o2hb_dependent_users = 0; - return o2hb_debug_init(); } @@ -1439,7 +1384,6 @@ static void o2hb_region_release(struct config_item *item) debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); spin_lock(&o2hb_live_lock); @@ -2004,18 +1948,6 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) goto bail; } - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } - ret = 0; bail: return ret; @@ -2070,20 +2002,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, { struct task_struct *hb_task; struct o2hb_region *reg = to_o2hb_region(item); - int quorum_region = 0; /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); if (o2hb_global_heartbeat_active()) { clear_bit(reg->hr_region_num, o2hb_region_bitmap); clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) - quorum_region = 1; - clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); } hb_task = reg->hr_task; reg->hr_task = NULL; - reg->hr_item_dropped = 1; spin_unlock(&o2hb_live_lock); if (hb_task) @@ -2101,27 +2028,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", config_item_name(®->hr_item)); - config_item_put(item); - - if (!o2hb_global_heartbeat_active() || !quorum_region) - return; - - /* - * If global heartbeat active and there are dependent users, - * pin all regions if quorum region count <= CUT_OFF - */ - spin_lock(&o2hb_live_lock); - - if (!o2hb_dependent_users) - goto unlock; - - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, - O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) - o2hb_region_pin(NULL); - -unlock: 
- spin_unlock(&o2hb_live_lock); } struct o2hb_heartbeat_group_attribute { @@ -2307,138 +2214,63 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, } EXPORT_SYMBOL_GPL(o2hb_setup_callback); -/* - * In local heartbeat mode, region_uuid passed matches the dlm domain name. - * In global heartbeat mode, region_uuid passed is NULL. - * - * In local, we only pin the matching region. In global we pin all the active - * regions. - */ -static int o2hb_region_pin(const char *region_uuid) +static struct o2hb_region *o2hb_find_region(const char *region_uuid) { - int ret = 0, found = 0; - struct o2hb_region *reg; - char *uuid; + struct o2hb_region *p, *reg = NULL; assert_spin_locked(&o2hb_live_lock); - list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { - uuid = config_item_name(®->hr_item); - - /* local heartbeat */ - if (region_uuid) { - if (strcmp(region_uuid, uuid)) - continue; - found = 1; - } - - if (reg->hr_item_pinned || reg->hr_item_dropped) - goto skip_pin; - - /* Ignore ENOENT only for local hb (userdlm domain) */ - ret = o2nm_depend_item(®->hr_item); - if (!ret) { - mlog(ML_CLUSTER, "Pin region %s\n", uuid); - reg->hr_item_pinned = 1; - } else { - if (ret == -ENOENT && found) - ret = 0; - else { - mlog(ML_ERROR, "Pin region %s fails with %d\n", - uuid, ret); - break; - } - } -skip_pin: - if (found) + list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { + if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { + reg = p; break; - } - - return ret; -} - -/* - * In local heartbeat mode, region_uuid passed matches the dlm domain name. - * In global heartbeat mode, region_uuid passed is NULL. - * - * In local, we only unpin the matching region. In global we unpin all the - * active regions. - */ -static void o2hb_region_unpin(const char *region_uuid) -{ - struct o2hb_region *reg; - char *uuid; - int found = 0; - - assert_spin_locked(&o2hb_live_lock); - - list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { - uuid = config_item_name(®->hr_item); - if (region_uuid) { - if (strcmp(region_uuid, uuid)) - continue; - found = 1; } - - if (reg->hr_item_pinned) { - mlog(ML_CLUSTER, "Unpin region %s\n", uuid); - o2nm_undepend_item(®->hr_item); - reg->hr_item_pinned = 0; - } - if (found) - break; } + + return reg; } -static int o2hb_region_inc_user(const char *region_uuid) +static int o2hb_region_get(const char *region_uuid) { int ret = 0; + struct o2hb_region *reg; spin_lock(&o2hb_live_lock); - /* local heartbeat */ - if (!o2hb_global_heartbeat_active()) { - ret = o2hb_region_pin(region_uuid); - goto unlock; - } + reg = o2hb_find_region(region_uuid); + if (!reg) + ret = -ENOENT; + spin_unlock(&o2hb_live_lock); - /* - * if global heartbeat active and this is the first dependent user, - * pin all regions if quorum region count <= CUT_OFF - */ - o2hb_dependent_users++; - if (o2hb_dependent_users > 1) - goto unlock; + if (ret) + goto out; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, - O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) - ret = o2hb_region_pin(NULL); + ret = o2nm_depend_this_node(); + if (ret) + goto out; -unlock: - spin_unlock(&o2hb_live_lock); + ret = o2nm_depend_item(®->hr_item); + if (ret) + o2nm_undepend_this_node(); + +out: return ret; } -void o2hb_region_dec_user(const char *region_uuid) +static void o2hb_region_put(const char *region_uuid) { - spin_lock(&o2hb_live_lock); + struct o2hb_region *reg; - /* local heartbeat */ - if (!o2hb_global_heartbeat_active()) { - o2hb_region_unpin(region_uuid); - goto unlock; - } + spin_lock(&o2hb_live_lock); - /* - * if global 
heartbeat active and there are no dependent users, - * unpin all quorum regions - */ - o2hb_dependent_users--; - if (!o2hb_dependent_users) - o2hb_region_unpin(NULL); + reg = o2hb_find_region(region_uuid); -unlock: spin_unlock(&o2hb_live_lock); + + if (reg) { + o2nm_undepend_item(®->hr_item); + o2nm_undepend_this_node(); + } } int o2hb_register_callback(const char *region_uuid, @@ -2459,11 +2291,9 @@ int o2hb_register_callback(const char *region_uuid, } if (region_uuid) { - ret = o2hb_region_inc_user(region_uuid); - if (ret) { - mlog_errno(ret); + ret = o2hb_region_get(region_uuid); + if (ret) goto out; - } } down_write(&o2hb_callback_sem); @@ -2481,7 +2311,7 @@ int o2hb_register_callback(const char *region_uuid, up_write(&o2hb_callback_sem); ret = 0; out: - mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", + mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", ret, __builtin_return_address(0), hc); return ret; } @@ -2492,7 +2322,7 @@ void o2hb_unregister_callback(const char *region_uuid, { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); - mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", + mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", __builtin_return_address(0), hc); /* XXX Can this happen _with_ a region reference? */ @@ -2500,7 +2330,7 @@ void o2hb_unregister_callback(const char *region_uuid, return; if (region_uuid) - o2hb_region_dec_user(region_uuid); + o2hb_region_put(region_uuid); down_write(&o2hb_callback_sem); diff --git a/trunk/fs/ocfs2/cluster/netdebug.c b/trunk/fs/ocfs2/cluster/netdebug.c index 3a5835904b3d..a3f150e52b02 100644 --- a/trunk/fs/ocfs2/cluster/netdebug.c +++ b/trunk/fs/ocfs2/cluster/netdebug.c @@ -46,15 +46,10 @@ #define O2NET_DEBUG_DIR "o2net" #define SC_DEBUG_NAME "sock_containers" #define NST_DEBUG_NAME "send_tracking" -#define STATS_DEBUG_NAME "stats" - -#define SHOW_SOCK_CONTAINERS 0 -#define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; static struct dentry *sc_dentry; static struct dentry *nst_dentry; -static struct dentry *stats_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -128,42 +123,37 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int nst_seq_show(struct seq_file *seq, void *v) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - ktime_t now; - s64 sock, send, status; spin_lock(&o2net_debug_lock); nst = next_nst(dummy_nst); - if (!nst) - goto out; - now = ktime_get(); - sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); - send = ktime_to_us(ktime_sub(now, nst->st_send_time)); - status = ktime_to_us(ktime_sub(now, nst->st_status_time)); - - /* get_task_comm isn't exported. oh well. */ - seq_printf(seq, "%p:\n" - " pid: %lu\n" - " tgid: %lu\n" - " process name: %s\n" - " node: %u\n" - " sc: %p\n" - " message id: %d\n" - " message type: %u\n" - " message key: 0x%08x\n" - " sock acquiry: %lld usecs ago\n" - " send start: %lld usecs ago\n" - " wait start: %lld usecs ago\n", - nst, (unsigned long)task_pid_nr(nst->st_task), - (unsigned long)nst->st_task->tgid, - nst->st_task->comm, nst->st_node, - nst->st_sc, nst->st_id, nst->st_msg_type, - nst->st_msg_key, - (long long)sock, - (long long)send, - (long long)status); + if (nst != NULL) { + /* get_task_comm isn't exported. oh well. 
*/ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lu.%ld\n" + " send start: %lu.%ld\n" + " wait start: %lu.%ld\n", + nst, (unsigned long)nst->st_task->pid, + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + nst->st_sock_time.tv_sec, + (long)nst->st_sock_time.tv_usec, + nst->st_send_time.tv_sec, + (long)nst->st_send_time.tv_usec, + nst->st_status_time.tv_sec, + (long)nst->st_status_time.tv_usec); + } -out: spin_unlock(&o2net_debug_lock); return 0; @@ -238,11 +228,6 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc) spin_unlock(&o2net_debug_lock); } -struct o2net_sock_debug { - int dbg_ctxt; - struct o2net_sock_container *dbg_sock; -}; - static struct o2net_sock_container *next_sc(struct o2net_sock_container *sc_start) { @@ -268,8 +253,7 @@ static struct o2net_sock_container static void *sc_seq_start(struct seq_file *seq, loff_t *pos) { - struct o2net_sock_debug *sd = seq->private; - struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + struct o2net_sock_container *sc, *dummy_sc = seq->private; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -280,8 +264,7 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct o2net_sock_debug *sd = seq->private; - struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + struct o2net_sock_container *sc, *dummy_sc = seq->private; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -293,107 +276,65 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#ifdef CONFIG_OCFS2_FS_STATS -# define sc_send_count(_s) ((_s)->sc_send_count) -# define sc_recv_count(_s) ((_s)->sc_recv_count) -# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) -# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) -# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) -# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) -#else -# define sc_send_count(_s) (0U) -# define sc_recv_count(_s) (0U) -# define sc_tv_acquiry_total_ns(_s) (0LL) -# define sc_tv_send_total_ns(_s) (0LL) -# define sc_tv_status_total_ns(_s) (0LL) -# define sc_tv_process_total_ns(_s) (0LL) -#endif - -/* So that debugfs.ocfs2 can determine which format is being used */ -#define O2NET_STATS_STR_VERSION 1 -static void sc_show_sock_stats(struct seq_file *seq, - struct o2net_sock_container *sc) -{ - if (!sc) - return; - - seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, - sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), - (long long)sc_tv_acquiry_total_ns(sc), - (long long)sc_tv_send_total_ns(sc), - (long long)sc_tv_status_total_ns(sc), - (unsigned long)sc_recv_count(sc), - (long long)sc_tv_process_total_ns(sc)); -} - -static void sc_show_sock_container(struct seq_file *seq, - struct o2net_sock_container *sc) -{ - struct inet_sock *inet = NULL; - __be32 saddr = 0, daddr = 0; - __be16 sport = 0, dport = 0; - - if (!sc) - return; - - if (sc->sc_sock) { - inet = inet_sk(sc->sc_sock->sk); - /* the stack's structs aren't sparse endian clean */ - saddr = (__force __be32)inet->inet_saddr; - daddr = (__force __be32)inet->inet_daddr; - sport = (__force __be16)inet->inet_sport; - 
dport = (__force __be16)inet->inet_dport; - } - - /* XXX sigh, inet-> doesn't have sparse annotation so any - * use of it here generates a warning with -Wbitwise */ - seq_printf(seq, "%p:\n" - " krefs: %d\n" - " sock: %pI4:%u -> " - "%pI4:%u\n" - " remote node: %s\n" - " page off: %zu\n" - " handshake ok: %u\n" - " timer: %lld usecs\n" - " data ready: %lld usecs\n" - " advance start: %lld usecs\n" - " advance stop: %lld usecs\n" - " func start: %lld usecs\n" - " func stop: %lld usecs\n" - " func key: 0x%08x\n" - " func type: %u\n", - sc, - atomic_read(&sc->sc_kref.refcount), - &saddr, inet ? ntohs(sport) : 0, - &daddr, inet ? ntohs(dport) : 0, - sc->sc_node->nd_name, - sc->sc_page_off, - sc->sc_handshake_ok, - (long long)ktime_to_us(sc->sc_tv_timer), - (long long)ktime_to_us(sc->sc_tv_data_ready), - (long long)ktime_to_us(sc->sc_tv_advance_start), - (long long)ktime_to_us(sc->sc_tv_advance_stop), - (long long)ktime_to_us(sc->sc_tv_func_start), - (long long)ktime_to_us(sc->sc_tv_func_stop), - sc->sc_msg_key, - sc->sc_msg_type); -} +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { - struct o2net_sock_debug *sd = seq->private; - struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + struct o2net_sock_container *sc, *dummy_sc = seq->private; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); - if (sc) { - if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) - sc_show_sock_container(seq, sc); - else - sc_show_sock_stats(seq, sc); + if (sc != NULL) { + struct inet_sock *inet = NULL; + + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lu.%ld\n" + " data ready: %lu.%ld\n" + " advance start: %lu.%ld\n" + " advance stop: %lu.%ld\n" + " func start: %lu.%ld\n" + " func stop: %lu.%ld\n" + " func key: %u\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? 
ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + TV_SEC_USEC(sc->sc_tv_timer), + TV_SEC_USEC(sc->sc_tv_data_ready), + TV_SEC_USEC(sc->sc_tv_advance_start), + TV_SEC_USEC(sc->sc_tv_advance_stop), + TV_SEC_USEC(sc->sc_tv_func_start), + TV_SEC_USEC(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); } + spin_unlock(&o2net_debug_lock); return 0; @@ -410,7 +351,7 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +static int sc_fop_open(struct inode *inode, struct file *file) { struct o2net_sock_container *dummy_sc; struct seq_file *seq; @@ -428,8 +369,7 @@ static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) goto out; seq = file->private_data; - seq->private = sd; - sd->dbg_sock = dummy_sc; + seq->private = dummy_sc; o2net_debug_add_sc(dummy_sc); dummy_sc = NULL; @@ -442,48 +382,12 @@ static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) static int sc_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - struct o2net_sock_debug *sd = seq->private; - struct o2net_sock_container *dummy_sc = sd->dbg_sock; + struct o2net_sock_container *dummy_sc = seq->private; o2net_debug_del_sc(dummy_sc); return seq_release_private(inode, file); } -static int stats_fop_open(struct inode *inode, struct file *file) -{ - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_STATS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); -} - -static const struct file_operations stats_seq_fops = { - .open = stats_fop_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sc_fop_release, -}; - -static int sc_fop_open(struct inode *inode, struct file *file) -{ - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); -} - static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, @@ -515,29 +419,25 @@ int o2net_debugfs_init(void) goto bail; } - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &stats_seq_fops); - if (!stats_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } - return 0; bail: - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + if (sc_dentry) + debugfs_remove(sc_dentry); + if (nst_dentry) + debugfs_remove(nst_dentry); + if (o2net_dentry) + debugfs_remove(o2net_dentry); return -ENOMEM; } void o2net_debugfs_exit(void) { - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + if (sc_dentry) + debugfs_remove(sc_dentry); + if (nst_dentry) + debugfs_remove(nst_dentry); + if (o2net_dentry) + debugfs_remove(o2net_dentry); } #endif /* CONFIG_DEBUG_FS */ diff --git a/trunk/fs/ocfs2/cluster/tcp.c b/trunk/fs/ocfs2/cluster/tcp.c index 3b11cb1e38fc..9aa426e42123 100644 --- a/trunk/fs/ocfs2/cluster/tcp.c +++ b/trunk/fs/ocfs2/cluster/tcp.c @@ -153,113 +153,62 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking 
*nst) { - nst->st_sock_time = ktime_get(); + do_gettimeofday(&nst->st_sock_time); } -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { - nst->st_send_time = ktime_get(); + do_gettimeofday(&nst->st_send_time); } -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { - nst->st_status_time = ktime_get(); + do_gettimeofday(&nst->st_status_time); } -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) { nst->st_sc = sc; } -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { nst->st_id = msg_id; } -static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) -{ - sc->sc_tv_timer = ktime_get(); -} - -static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) -{ - sc->sc_tv_data_ready = ktime_get(); -} +#else /* CONFIG_DEBUG_FS */ -static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) { - sc->sc_tv_advance_start = ktime_get(); } -static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { - sc->sc_tv_advance_stop = ktime_get(); } -static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { - sc->sc_tv_func_start = ktime_get(); } -static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { - sc->sc_tv_func_stop = ktime_get(); } -static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) -{ - return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); -} -#else /* CONFIG_DEBUG_FS */ -# define o2net_init_nst(a, b, c, d, e) -# define o2net_set_nst_sock_time(a) -# define o2net_set_nst_send_time(a) -# define o2net_set_nst_status_time(a) -# define o2net_set_nst_sock_container(a, b) -# define o2net_set_nst_msg_id(a, b) -# define o2net_set_sock_timer(a) -# define o2net_set_data_ready_time(a) -# define o2net_set_advance_start_time(a) -# define o2net_set_advance_stop_time(a) -# define o2net_set_func_start_time(a) -# define o2net_set_func_stop_time(a) -# define o2net_get_func_run_time(a) (ktime_t)0 -#endif /* CONFIG_DEBUG_FS */ - -#ifdef CONFIG_OCFS2_FS_STATS -static void o2net_update_send_stats(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) { - sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, - ktime_sub(ktime_get(), - nst->st_status_time)); - sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, - ktime_sub(nst->st_status_time, - nst->st_send_time)); - sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, - ktime_sub(nst->st_send_time, - nst->st_sock_time)); - sc->sc_send_count++; } -static void o2net_update_recv_stats(struct o2net_sock_container *sc) +static inline void 
o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) { - sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, - o2net_get_func_run_time(sc)); - sc->sc_recv_count++; } -#else - -# define o2net_update_send_stats(a, b) - -# define o2net_update_recv_stats(sc) - -#endif /* CONFIG_OCFS2_FS_STATS */ +#endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) { @@ -406,7 +355,6 @@ static void sc_kref_release(struct kref *kref) sc->sc_sock = NULL; } - o2nm_undepend_item(&sc->sc_node->nd_item); o2nm_node_put(sc->sc_node); sc->sc_node = NULL; @@ -428,7 +376,6 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) { struct o2net_sock_container *sc, *ret = NULL; struct page *page = NULL; - int status = 0; page = alloc_page(GFP_NOFS); sc = kzalloc(sizeof(*sc), GFP_NOFS); @@ -439,13 +386,6 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) o2nm_node_get(node); sc->sc_node = node; - /* pin the node item of the remote node */ - status = o2nm_depend_item(&node->nd_item); - if (status) { - mlog_errno(status); - o2nm_node_put(node); - goto out; - } INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); @@ -606,7 +546,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) if (sk->sk_user_data) { struct o2net_sock_container *sc = sk->sk_user_data; sclog(sc, "data_ready hit\n"); - o2net_set_data_ready_time(sc); + do_gettimeofday(&sc->sc_tv_data_ready); o2net_sc_queue_work(sc, &sc->sc_rx_work); ready = sc->sc_data_ready; } else { @@ -1130,8 +1070,6 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_status_time(&nst); wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); - o2net_update_send_stats(&nst, sc); - /* Note that we avoid overwriting the callers status return * variable if a system error was reported on the other * side. Callers beware. 
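Aside, not part of the patch: the CONFIG_DEBUG_FS branch earlier in this tcp.c hunk replaces the send-tracking timestamp helpers with empty static inline stubs, so call sites stay free of #ifdefs and the compiler discards the calls when the option is off. The same idiom in miniature, with an invented CONFIG_EXAMPLE_TRACE option and example_trace_send() helper:

#include <linux/kernel.h>
#include <linux/types.h>

#ifdef CONFIG_EXAMPLE_TRACE
static void example_trace_send(u32 msg_id)
{
	pr_debug("sending message %u\n", msg_id);
}
#else
/* compiles away entirely when the option is off; callers need no #ifdefs */
static inline void example_trace_send(u32 msg_id)
{
}
#endif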
*/ @@ -1245,15 +1183,13 @@ static int o2net_process_message(struct o2net_sock_container *sc, if (syserr != O2NET_ERR_NONE) goto out_respond; - o2net_set_func_start_time(sc); + do_gettimeofday(&sc->sc_tv_func_start); sc->sc_msg_key = be32_to_cpu(hdr->key); sc->sc_msg_type = be16_to_cpu(hdr->msg_type); handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len), nmh->nh_func_data, &ret_data); - o2net_set_func_stop_time(sc); - - o2net_update_recv_stats(sc); + do_gettimeofday(&sc->sc_tv_func_stop); out_respond: /* this destroys the hdr, so don't use it after this */ @@ -1364,7 +1300,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) size_t datalen; sclog(sc, "receiving\n"); - o2net_set_advance_start_time(sc); + do_gettimeofday(&sc->sc_tv_advance_start); if (unlikely(sc->sc_handshake_ok == 0)) { if(sc->sc_page_off < sizeof(struct o2net_handshake)) { @@ -1439,7 +1375,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) out: sclog(sc, "ret = %d\n", ret); - o2net_set_advance_stop_time(sc); + do_gettimeofday(&sc->sc_tv_advance_stop); return ret; } @@ -1539,28 +1475,27 @@ static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + struct timeval now; -#ifdef CONFIG_DEBUG_FS - ktime_t now = ktime_get(); -#endif + do_gettimeofday(&now); printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - -#ifdef CONFIG_DEBUG_FS - mlog(ML_NOTICE, "Here are some times that might help debug the " - "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " - "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", - (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), - (long long)ktime_to_us(sc->sc_tv_data_ready), - (long long)ktime_to_us(sc->sc_tv_advance_start), - (long long)ktime_to_us(sc->sc_tv_advance_stop), + mlog(ML_NOTICE, "here are some times that might help debug the " + "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " + "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", + sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, + now.tv_sec, (long) now.tv_usec, + sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, + (long) sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, + (long) sc->sc_tv_advance_stop.tv_usec, sc->sc_msg_key, sc->sc_msg_type, - (long long)ktime_to_us(sc->sc_tv_func_start), - (long long)ktime_to_us(sc->sc_tv_func_stop)); -#endif + sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); /* * Initialize the nn_timeout so that the next connection attempt @@ -1576,7 +1511,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, msecs_to_jiffies(o2net_keepalive_delay())); - o2net_set_sock_timer(sc); + do_gettimeofday(&sc->sc_tv_timer); mod_timer(&sc->sc_idle_timeout, jiffies + msecs_to_jiffies(o2net_idle_timeout())); } diff --git a/trunk/fs/ocfs2/cluster/tcp_internal.h b/trunk/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..15fdbdf9eb4b 100644 --- a/trunk/fs/ocfs2/cluster/tcp_internal.h +++ b/trunk/fs/ocfs2/cluster/tcp_internal.h @@ -166,27 +166,18 @@ struct 
o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); void (*sc_data_ready)(struct sock *sk, int bytes); - - u32 sc_msg_key; - u16 sc_msg_type; - #ifdef CONFIG_DEBUG_FS struct list_head sc_net_debug_item; - ktime_t sc_tv_timer; - ktime_t sc_tv_data_ready; - ktime_t sc_tv_advance_start; - ktime_t sc_tv_advance_stop; - ktime_t sc_tv_func_start; - ktime_t sc_tv_func_stop; -#endif -#ifdef CONFIG_OCFS2_FS_STATS - ktime_t sc_tv_acquiry_total; - ktime_t sc_tv_send_total; - ktime_t sc_tv_status_total; - u32 sc_send_count; - u32 sc_recv_count; - ktime_t sc_tv_process_total; #endif + struct timeval sc_tv_timer; + struct timeval sc_tv_data_ready; + struct timeval sc_tv_advance_start; + struct timeval sc_tv_advance_stop; + struct timeval sc_tv_func_start; + struct timeval sc_tv_func_stop; + u32 sc_msg_key; + u16 sc_msg_type; + struct mutex sc_send_lock; }; @@ -229,9 +220,9 @@ struct o2net_send_tracking { u32 st_msg_type; u32 st_msg_key; u8 st_node; - ktime_t st_sock_time; - ktime_t st_send_time; - ktime_t st_status_time; + struct timeval st_sock_time; + struct timeval st_send_time; + struct timeval st_status_time; }; #else struct o2net_send_tracking { diff --git a/trunk/fs/ocfs2/dlm/dlmast.c b/trunk/fs/ocfs2/dlm/dlmast.c index 3a3ed4bb794b..f44999156839 100644 --- a/trunk/fs/ocfs2/dlm/dlmast.c +++ b/trunk/fs/ocfs2/dlm/dlmast.c @@ -90,29 +90,19 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - struct dlm_lock_resource *res; + mlog_entry_void(); BUG_ON(!dlm); BUG_ON(!lock); - res = lock->lockres; - assert_spin_locked(&dlm->ast_lock); - if (!list_empty(&lock->ast_list)) { - mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " - "AST list not empty, pending %d, newlevel %d\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + mlog(ML_ERROR, "ast list not empty!! 
pending=%d, newlevel=%d\n", lock->ast_pending, lock->ml.type); BUG(); } if (lock->ast_pending) - mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + mlog(0, "lock has an ast getting flushed right now\n"); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -120,10 +110,9 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) /* check to see if this ast obsoletes the bast */ if (dlm_should_cancel_bast(dlm, lock)) { - mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + struct dlm_lock_resource *res = lock->lockres; + mlog(0, "%s: cancelling bast for %.*s\n", + dlm->name, res->lockname.len, res->lockname.name); lock->bast_pending = 0; list_del_init(&lock->bast_list); lock->ml.highest_blocked = LKM_IVMODE; @@ -145,6 +134,8 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { + mlog_entry_void(); + BUG_ON(!dlm); BUG_ON(!lock); @@ -156,21 +147,15 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - struct dlm_lock_resource *res; + mlog_entry_void(); BUG_ON(!dlm); BUG_ON(!lock); - assert_spin_locked(&dlm->ast_lock); - res = lock->lockres; - BUG_ON(!list_empty(&lock->bast_list)); if (lock->bast_pending) - mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + mlog(0, "lock has a bast getting flushed right now\n"); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -182,6 +167,8 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { + mlog_entry_void(); + BUG_ON(!dlm); BUG_ON(!lock); @@ -226,10 +213,7 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, dlm_astlockfunc_t *fn; struct dlm_lockstatus *lksb; - mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, - res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + mlog_entry_void(); lksb = lock->lksb; fn = lock->ast; @@ -247,10 +231,7 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lockstatus *lksb; int lksbflags; - mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, - res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + mlog_entry_void(); lksb = lock->lksb; BUG_ON(lock->ml.node == dlm->node_num); @@ -269,14 +250,9 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, { dlm_bastlockfunc_t *fn = lock->bast; + mlog_entry_void(); BUG_ON(lock->ml.node != dlm->node_num); - mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - blocked_type); - (*fn)(lock->astdata, 
blocked_type); } @@ -356,8 +332,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* cannot get a proxy ast message if this node owns it */ BUG_ON(res->owner == dlm->node_num); - mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, - res->lockname.name); + mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -407,12 +382,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); - mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), - lock->ml.type, lock->ml.convert_type); - + mlog(0, "ast: Adding to granted list... type=%d, " + "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; @@ -455,9 +426,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, size_t veclen = 1; int status; - mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, - res->lockname.len, res->lockname.name, lock->ml.node, msg_type, - blocked_type); + mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", + res->lockname.len, res->lockname.name, lock->ml.node, + msg_type, blocked_type); memset(&past, 0, sizeof(struct dlm_proxy_ast)); past.node_idx = dlm->node_num; @@ -470,6 +441,7 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, vec[0].iov_len = sizeof(struct dlm_proxy_ast); vec[0].iov_base = &past; if (flags & DLM_LKSB_GET_LVB) { + mlog(0, "returning requested LVB data\n"); be32_add_cpu(&past.flags, LKM_GET_LVB); vec[1].iov_len = DLM_LVB_LEN; vec[1].iov_base = lock->lksb->lvb; @@ -479,8 +451,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, lock->ml.node, &status); if (ret < 0) - mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", - dlm->name, res->lockname.len, res->lockname.name, ret, + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, lock->ml.node); else { if (status == DLM_RECOVERING) { diff --git a/trunk/fs/ocfs2/dlm/dlmcommon.h b/trunk/fs/ocfs2/dlm/dlmcommon.h index 4bdf7baee344..b36d0bf77a5a 100644 --- a/trunk/fs/ocfs2/dlm/dlmcommon.h +++ b/trunk/fs/ocfs2/dlm/dlmcommon.h @@ -50,10 +50,10 @@ #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_mle_type { - DLM_MLE_BLOCK = 0, - DLM_MLE_MASTER = 1, - DLM_MLE_MIGRATION = 2, - DLM_MLE_NUM_TYPES = 3, + DLM_MLE_BLOCK, + DLM_MLE_MASTER, + DLM_MLE_MIGRATION, + DLM_MLE_NUM_TYPES }; struct dlm_master_list_entry { @@ -82,8 +82,8 @@ struct dlm_master_list_entry { enum dlm_ast_type { DLM_AST = 0, - DLM_BAST = 1, - DLM_ASTUNLOCK = 2, + DLM_BAST, + DLM_ASTUNLOCK }; @@ -119,9 +119,9 @@ struct dlm_recovery_ctxt enum dlm_ctxt_state { DLM_CTXT_NEW = 0, - DLM_CTXT_JOINED = 1, - DLM_CTXT_IN_SHUTDOWN = 2, - DLM_CTXT_LEAVING = 3, + DLM_CTXT_JOINED, + DLM_CTXT_IN_SHUTDOWN, + DLM_CTXT_LEAVING, }; struct dlm_ctxt @@ -388,8 +388,8 @@ struct dlm_lock enum dlm_lockres_list { DLM_GRANTED_LIST = 0, - DLM_CONVERTING_LIST = 1, - DLM_BLOCKED_LIST = 2, + DLM_CONVERTING_LIST, + DLM_BLOCKED_LIST }; 
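One more aside, not part of the patch: the enum rewrites in this dlmcommon.h hunk (including the message-number list just below) rely on C's implicit enumerator numbering, where each unadorned enumerator is one greater than its predecessor, so dropping the explicit "= 1, = 2, ..." assignments leaves every value unchanged; the trailing comments merely record the resulting numbers. A tiny self-contained illustration (example_msg_nums is invented):

enum example_msg_nums {
	EXAMPLE_MSG_FIRST = 500,
	EXAMPLE_MSG_SECOND,	/* implicitly 501 */
	EXAMPLE_MSG_THIRD,	/* implicitly 502 */
};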
static inline int dlm_lvb_is_empty(char *lvb) @@ -427,27 +427,27 @@ struct dlm_node_iter enum { - DLM_MASTER_REQUEST_MSG = 500, - DLM_UNUSED_MSG1 = 501, - DLM_ASSERT_MASTER_MSG = 502, - DLM_CREATE_LOCK_MSG = 503, - DLM_CONVERT_LOCK_MSG = 504, - DLM_PROXY_AST_MSG = 505, - DLM_UNLOCK_LOCK_MSG = 506, - DLM_DEREF_LOCKRES_MSG = 507, - DLM_MIGRATE_REQUEST_MSG = 508, - DLM_MIG_LOCKRES_MSG = 509, - DLM_QUERY_JOIN_MSG = 510, - DLM_ASSERT_JOINED_MSG = 511, - DLM_CANCEL_JOIN_MSG = 512, - DLM_EXIT_DOMAIN_MSG = 513, - DLM_MASTER_REQUERY_MSG = 514, - DLM_LOCK_REQUEST_MSG = 515, - DLM_RECO_DATA_DONE_MSG = 516, - DLM_BEGIN_RECO_MSG = 517, - DLM_FINALIZE_RECO_MSG = 518, - DLM_QUERY_REGION = 519, - DLM_QUERY_NODEINFO = 520, + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1, /* 501 */ + DLM_ASSERT_MASTER_MSG, /* 502 */ + DLM_CREATE_LOCK_MSG, /* 503 */ + DLM_CONVERT_LOCK_MSG, /* 504 */ + DLM_PROXY_AST_MSG, /* 505 */ + DLM_UNLOCK_LOCK_MSG, /* 506 */ + DLM_DEREF_LOCKRES_MSG, /* 507 */ + DLM_MIGRATE_REQUEST_MSG, /* 508 */ + DLM_MIG_LOCKRES_MSG, /* 509 */ + DLM_QUERY_JOIN_MSG, /* 510 */ + DLM_ASSERT_JOINED_MSG, /* 511 */ + DLM_CANCEL_JOIN_MSG, /* 512 */ + DLM_EXIT_DOMAIN_MSG, /* 513 */ + DLM_MASTER_REQUERY_MSG, /* 514 */ + DLM_LOCK_REQUEST_MSG, /* 515 */ + DLM_RECO_DATA_DONE_MSG, /* 516 */ + DLM_BEGIN_RECO_MSG, /* 517 */ + DLM_FINALIZE_RECO_MSG, /* 518 */ + DLM_QUERY_REGION, /* 519 */ + DLM_QUERY_NODEINFO, /* 520 */ }; struct dlm_reco_node_data @@ -460,19 +460,19 @@ struct dlm_reco_node_data enum { DLM_RECO_NODE_DATA_DEAD = -1, DLM_RECO_NODE_DATA_INIT = 0, - DLM_RECO_NODE_DATA_REQUESTING = 1, - DLM_RECO_NODE_DATA_REQUESTED = 2, - DLM_RECO_NODE_DATA_RECEIVING = 3, - DLM_RECO_NODE_DATA_DONE = 4, - DLM_RECO_NODE_DATA_FINALIZE_SENT = 5, + DLM_RECO_NODE_DATA_REQUESTING, + DLM_RECO_NODE_DATA_REQUESTED, + DLM_RECO_NODE_DATA_RECEIVING, + DLM_RECO_NODE_DATA_DONE, + DLM_RECO_NODE_DATA_FINALIZE_SENT, }; enum { DLM_MASTER_RESP_NO = 0, - DLM_MASTER_RESP_YES = 1, - DLM_MASTER_RESP_MAYBE = 2, - DLM_MASTER_RESP_ERROR = 3, + DLM_MASTER_RESP_YES, + DLM_MASTER_RESP_MAYBE, + DLM_MASTER_RESP_ERROR }; @@ -649,9 +649,9 @@ struct dlm_proxy_ast #define DLM_MOD_KEY (0x666c6172) enum dlm_query_join_response_code { JOIN_DISALLOW = 0, - JOIN_OK = 1, - JOIN_OK_NO_MAP = 2, - JOIN_PROTOCOL_MISMATCH = 3, + JOIN_OK, + JOIN_OK_NO_MAP, + JOIN_PROTOCOL_MISMATCH, }; struct dlm_query_join_packet { diff --git a/trunk/fs/ocfs2/dlm/dlmdebug.c b/trunk/fs/ocfs2/dlm/dlmdebug.c index 04a32be0aeb9..272ec8631a51 100644 --- a/trunk/fs/ocfs2/dlm/dlmdebug.c +++ b/trunk/fs/ocfs2/dlm/dlmdebug.c @@ -370,46 +370,92 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc) kref_get(&dc->debug_refcnt); } -static int debug_release(struct inode *inode, struct file *file) +static struct debug_buffer *debug_buffer_allocate(void) { - free_page((unsigned long)file->private_data); - return 0; + struct debug_buffer *db = NULL; + + db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); + if (!db) + goto bail; + + db->len = PAGE_SIZE; + db->buf = kmalloc(db->len, GFP_KERNEL); + if (!db->buf) + goto bail; + + return db; +bail: + kfree(db); + return NULL; +} + +static ssize_t debug_buffer_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct debug_buffer *db = file->private_data; + + return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); } -static ssize_t debug_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) { - return 
simple_read_from_buffer(buf, nbytes, ppos, file->private_data, - i_size_read(file->f_mapping->host)); + struct debug_buffer *db = file->private_data; + loff_t new = -1; + + switch (whence) { + case 0: + new = off; + break; + case 1: + new = file->f_pos + off; + break; + } + + if (new < 0 || new > db->len) + return -EINVAL; + + return (file->f_pos = new); +} + +static int debug_buffer_release(struct inode *inode, struct file *file) +{ + struct debug_buffer *db = file->private_data; + + if (db) + kfree(db->buf); + kfree(db); + + return 0; } /* end - util funcs */ /* begin - purge list funcs */ -static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) +static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) { struct dlm_lock_resource *res; int out = 0; unsigned long total = 0; - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); list_for_each_entry(res, &dlm->purge_list, purge) { ++total; - if (len - out < 100) + if (db->len - out < 100) continue; spin_lock(&res->spinlock); out += stringify_lockname(res->lockname.name, res->lockname.len, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\t%ld\n", + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + out += snprintf(db->buf + out, db->len - out, + "Total on list: %ld\n", total); return out; } @@ -417,15 +463,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) static int debug_purgelist_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - char *buf = NULL; + struct debug_buffer *db; - buf = (char *) get_zeroed_page(GFP_NOFS); - if (!buf) + db = debug_buffer_allocate(); + if (!db) goto bail; - i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); + db->len = debug_purgelist_print(dlm, db); - file->private_data = buf; + file->private_data = db; return 0; bail: @@ -434,14 +480,14 @@ static int debug_purgelist_open(struct inode *inode, struct file *file) static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, - .release = debug_release, - .read = debug_read, - .llseek = generic_file_llseek, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, }; /* end - purge list funcs */ /* begin - debug mle funcs */ -static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) +static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; @@ -449,7 +495,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); @@ -460,16 +506,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) master_hash_node); ++total; ++bucket_count; - if (len - out < 200) + if (db->len - out < 200) continue; - out += dump_mle(mle, buf + out, len - out); + out += dump_mle(mle, db->buf + out, db->len - out); } longest = max(longest, bucket_count); bucket_count = 0; } spin_unlock(&dlm->master_lock); - out += 
snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Total: %ld, Longest: %ld\n", total, longest); return out; } @@ -477,15 +523,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) static int debug_mle_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - char *buf = NULL; + struct debug_buffer *db; - buf = (char *) get_zeroed_page(GFP_NOFS); - if (!buf) + db = debug_buffer_allocate(); + if (!db) goto bail; - i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); + db->len = debug_mle_print(dlm, db); - file->private_data = buf; + file->private_data = db; return 0; bail: @@ -494,9 +540,9 @@ static int debug_mle_open(struct inode *inode, struct file *file) static const struct file_operations debug_mle_fops = { .open = debug_mle_open, - .release = debug_release, - .read = debug_read, - .llseek = generic_file_llseek, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, }; /* end - debug mle funcs */ @@ -711,7 +757,7 @@ static const struct file_operations debug_lockres_fops = { /* end - debug lockres funcs */ /* begin - debug state funcs */ -static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) +static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) { int out = 0; struct dlm_reco_node_data *node; @@ -735,35 +781,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Thread Pid: %d Node: %d State: %s\n", - task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); + dlm->dlm_thread_task->pid, dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Number of Joins: %d Joining Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Domain Map: "); + out += snprintf(db->buf + out, db->len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Live Map: "); + out += snprintf(db->buf + out, db->len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -775,29 +821,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += 
snprintf(db->buf + out, db->len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -806,12 +852,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, atomic_read(&dlm->dlm_refs.refcount)); /* Dead Node: xxx */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? */ @@ -821,19 +867,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(buf + out, len - out, + out += snprintf(db->buf + out, db->len - out, "Recovery Pid: %d Master: %d State: %s\n", - task_pid_nr(dlm->dlm_reco_thread_task), + dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(buf + out, len - out, "Recovery Map: "); + out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); /* Recovery Node State: */ - out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -861,7 +907,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "BAD"; break; } - out += snprintf(buf + out, len - out, "\t%u - %s\n", + out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", node->node_num, state); } @@ -873,15 +919,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) static int debug_state_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - char *buf = NULL; + struct debug_buffer *db = NULL; - buf = (char *) get_zeroed_page(GFP_NOFS); - if (!buf) + db = debug_buffer_allocate(); + if (!db) goto bail; - i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); + db->len = debug_state_print(dlm, db); - file->private_data = buf; + file->private_data = db; return 0; bail: @@ -890,9 +936,9 @@ static int debug_state_open(struct inode *inode, struct file *file) static const struct file_operations debug_state_fops = { .open = debug_state_open, - .release = debug_release, - .read = debug_read, - .llseek = 
generic_file_llseek, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, }; /* end - debug state funcs */ @@ -956,10 +1002,14 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); + if (dc->debug_purgelist_dentry) + debugfs_remove(dc->debug_purgelist_dentry); + if (dc->debug_mle_dentry) + debugfs_remove(dc->debug_mle_dentry); + if (dc->debug_lockres_dentry) + debugfs_remove(dc->debug_lockres_dentry); + if (dc->debug_state_dentry) + debugfs_remove(dc->debug_state_dentry); dlm_debug_put(dc); } } @@ -990,7 +1040,8 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + if (dlm->dlm_debugfs_subroot) + debugfs_remove(dlm->dlm_debugfs_subroot); } /* debugfs root */ @@ -1006,6 +1057,7 @@ int dlm_create_debugfs_root(void) void dlm_destroy_debugfs_root(void) { - debugfs_remove(dlm_debugfs_root); + if (dlm_debugfs_root) + debugfs_remove(dlm_debugfs_root); } #endif /* CONFIG_DEBUG_FS */ diff --git a/trunk/fs/ocfs2/dlm/dlmdebug.h b/trunk/fs/ocfs2/dlm/dlmdebug.h index 1f27c4812d1a..8c686d22f9c7 100644 --- a/trunk/fs/ocfs2/dlm/dlmdebug.h +++ b/trunk/fs/ocfs2/dlm/dlmdebug.h @@ -37,6 +37,11 @@ struct dlm_debug_ctxt { struct dentry *debug_purgelist_dentry; }; +struct debug_buffer { + int len; + char *buf; +}; + struct debug_lockres { int dl_len; char *dl_buf; diff --git a/trunk/fs/ocfs2/dlm/dlmdomain.c b/trunk/fs/ocfs2/dlm/dlmdomain.c index 7e38a072d720..cc2aaa96cfe5 100644 --- a/trunk/fs/ocfs2/dlm/dlmdomain.c +++ b/trunk/fs/ocfs2/dlm/dlmdomain.c @@ -460,6 +460,8 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) } cond_resched_lock(&dlm->spinlock); num += n; + mlog(0, "%s: touched %d lockreses in bucket %d " + "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -1659,8 +1661,8 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) { - o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); - o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); + o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); + o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); o2net_unregister_handler_list(&dlm->dlm_domain_handlers); } @@ -1672,13 +1674,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); - status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); + status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); if (status) goto bail; o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); - status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); + status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); if (status) goto bail; diff --git a/trunk/fs/ocfs2/dlm/dlmlock.c b/trunk/fs/ocfs2/dlm/dlmlock.c index 7009292aac5a..69cf369961c4 100644 --- a/trunk/fs/ocfs2/dlm/dlmlock.c +++ b/trunk/fs/ocfs2/dlm/dlmlock.c @@ -106,9 +106,6 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; - if (!dlm_lock_compatible(tmplock->ml.convert_type, - lock->ml.type)) - return 0; } return 1; diff --git 
a/trunk/fs/ocfs2/dlm/dlmthread.c b/trunk/fs/ocfs2/dlm/dlmthread.c index 1d6d1d22c471..2211acf33d9b 100644 --- a/trunk/fs/ocfs2/dlm/dlmthread.c +++ b/trunk/fs/ocfs2/dlm/dlmthread.c @@ -122,13 +122,15 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ if (list_empty(&res->purge)) { - mlog(0, "%s: Adding res %.*s to purge list\n", - dlm->name, res->lockname.len, res->lockname.name); + mlog(0, "putting lockres %.*s:%p onto purge list\n", + res->lockname.len, res->lockname.name, res); res->last_used = jiffies; dlm_lockres_get(res); @@ -136,8 +138,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, dlm->purge_count++; } } else if (!list_empty(&res->purge)) { - mlog(0, "%s: Removing res %.*s from purge list\n", - dlm->name, res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", + res->lockname.len, res->lockname.name, res, res->owner); list_del_init(&res->purge); dlm_lockres_put(res); @@ -148,6 +150,7 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -168,8 +171,9 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, master = (res->owner == dlm->node_num); - mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name, - res->lockname.len, res->lockname.name, master); + + mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, + res->lockname.name, master); if (!master) { res->state |= DLM_LOCK_RES_DROPPING_REF; @@ -185,25 +189,27 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, /* clear our bit from the master's refmap, ignore errors */ ret = dlm_drop_lockres_ref(dlm, res); if (ret < 0) { - mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, - res->lockname.len, res->lockname.name, ret); + mlog_errno(ret); if (!dlm_is_host_down(ret)) BUG(); } + mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", + dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); } if (!list_empty(&res->purge)) { - mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", - dlm->name, res->lockname.len, res->lockname.name, master); + mlog(0, "removing lockres %.*s:%p from purgelist, " + "master = %d\n", res->lockname.len, res->lockname.name, + res, master); list_del_init(&res->purge); dlm_lockres_put(res); dlm->purge_count--; } if (!__dlm_lockres_unused(res)) { - mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); __dlm_print_one_lock_resource(res); BUG(); @@ -260,10 +266,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, unused = __dlm_lockres_unused(lockres); if (!unused || (lockres->state & DLM_LOCK_RES_MIGRATING)) { - mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); + mlog(0, "lockres %s:%.*s: is in use or " + "being remastered, used %d, state %d\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, !unused, lockres->state); list_move_tail(&dlm->purge_list, &lockres->purge); 
spin_unlock(&lockres->spinlock); continue; @@ -290,12 +296,15 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct list_head *head; int can_grant = 1; - /* - * Because this function is called with the lockres + //mlog(0, "res->lockname.len=%d\n", res->lockname.len); + //mlog(0, "res->lockname.name=%p\n", res->lockname.name); + //mlog(0, "shuffle res %.*s\n", res->lockname.len, + // res->lockname.name); + + /* because this function is called with the lockres * spinlock, and because we know that it is not migrating/ * recovering/in-progress, it is fine to reserve asts and - * basts right before queueing them all throughout - */ + * basts right before queueing them all throughout */ assert_spin_locked(&dlm->ast_lock); assert_spin_locked(&res->spinlock); BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| @@ -305,13 +314,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, converting: if (list_empty(&res->converting)) goto blocked; - mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, - res->lockname.len, res->lockname.name); + mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, + res->lockname.name); target = list_entry(res->converting.next, struct dlm_lock, list); if (target->ml.convert_type == LKM_IVMODE) { - mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", - dlm->name, res->lockname.len, res->lockname.name); + mlog(ML_ERROR, "%.*s: converting a lock with no " + "convert_type!\n", res->lockname.len, res->lockname.name); BUG(); } head = &res->granted; @@ -356,12 +365,9 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " - "%d => %d, node %u\n", dlm->name, res->lockname.len, - res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), - target->ml.type, + mlog(0, "calling ast for converting lock: %.*s, have: %d, " + "granting: %d, node: %u\n", res->lockname.len, + res->lockname.name, target->ml.type, target->ml.convert_type, target->ml.node); target->ml.type = target->ml.convert_type; @@ -422,14 +428,11 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " - "node %u\n", dlm->name, res->lockname.len, - res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " + "node: %u\n", res->lockname.len, res->lockname.name, target->ml.type, target->ml.node); - /* target->ml.type is already correct */ + // target->ml.type is already correct list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); @@ -450,6 +453,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, /* must have NO locks when calling this with res !=NULL * */ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { + mlog_entry("dlm=%p, res=%p\n", dlm, res); if (res) { spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -462,6 +466,8 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { + mlog_entry("dlm=%p, res=%p\n", dlm, res); + assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); @@ -478,16 +484,13 @@ void 
__dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) res->state |= DLM_LOCK_RES_DIRTY; } } - - mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, - res->lockname.name); } /* Launch the NM thread for the mounted volume */ int dlm_launch_thread(struct dlm_ctxt *dlm) { - mlog(0, "Starting dlm_thread...\n"); + mlog(0, "starting dlm thread...\n"); dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); if (IS_ERR(dlm->dlm_thread_task)) { @@ -502,7 +505,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) void dlm_complete_thread(struct dlm_ctxt *dlm) { if (dlm->dlm_thread_task) { - mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); + mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); kthread_stop(dlm->dlm_thread_task); dlm->dlm_thread_task = NULL; } @@ -533,12 +536,7 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* get an extra ref on lock */ dlm_lock_get(lock); res = lock->lockres; - mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " - "node %u\n", dlm->name, res->lockname.len, - res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - lock->ml.type, lock->ml.node); + mlog(0, "delivering an ast for this lockres\n"); BUG_ON(!lock->ast_pending); @@ -559,9 +557,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another ast was queued while * we were delivering the last one */ if (!list_empty(&lock->ast_list)) { - mlog(0, "%s: res %.*s, AST queued while flushing last " - "one\n", dlm->name, res->lockname.len, - res->lockname.name); + mlog(0, "aha another ast got queued while " + "we were finishing the last one. will " + "keep the ast_pending flag set.\n"); } else lock->ast_pending = 0; @@ -592,12 +590,8 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) dlm_lock_put(lock); spin_unlock(&dlm->ast_lock); - mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " - "blocked %d, node %u\n", - dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - hi, lock->ml.node); + mlog(0, "delivering a bast for this lockres " + "(blocked = %d\n", hi); if (lock->ml.node != dlm->node_num) { ret = dlm_send_proxy_bast(dlm, res, lock, hi); @@ -611,9 +605,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another bast was queued while * we were delivering the last one */ if (!list_empty(&lock->bast_list)) { - mlog(0, "%s: res %.*s, BAST queued while flushing last " - "one\n", dlm->name, res->lockname.len, - res->lockname.name); + mlog(0, "aha another bast got queued while " + "we were finishing the last one. will " + "keep the bast_pending flag set.\n"); } else lock->bast_pending = 0; @@ -681,12 +675,11 @@ static int dlm_thread(void *data) spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { __dlm_print_one_lock_resource(res); - mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," - " dirty %d\n", dlm->name, - !!(res->state & DLM_LOCK_RES_IN_PROGRESS), - !!(res->state & DLM_LOCK_RES_MIGRATING), - !!(res->state & DLM_LOCK_RES_RECOVERING), - !!(res->state & DLM_LOCK_RES_DIRTY)); + mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", + res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", + res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", + res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", + res->state & DLM_LOCK_RES_DIRTY ? 
"yes" : "no"); } BUG_ON(res->owner != dlm->node_num); @@ -700,8 +693,8 @@ static int dlm_thread(void *data) res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); spin_unlock(&dlm->ast_lock); - mlog(0, "%s: res %.*s, inprogress, delay list " - "shuffle, state %d\n", dlm->name, + mlog(0, "delaying list shuffling for in-" + "progress lockres %.*s, state=%d\n", res->lockname.len, res->lockname.name, res->state); delay = 1; @@ -713,6 +706,10 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); + /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); res->state &= ~DLM_LOCK_RES_DIRTY; @@ -736,8 +733,7 @@ static int dlm_thread(void *data) /* unlikely, but we may need to give time to * other tasks */ if (!--n) { - mlog(0, "%s: Throttling dlm thread\n", - dlm->name); + mlog(0, "throttling dlm_thread\n"); break; } } diff --git a/trunk/fs/ocfs2/namei.c b/trunk/fs/ocfs2/namei.c index 30c523144452..d14cad6e2e41 100644 --- a/trunk/fs/ocfs2/namei.c +++ b/trunk/fs/ocfs2/namei.c @@ -1017,11 +1017,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, * An error return must mean that no cluster locks * were held on function exit. */ - if (oi1->ip_blkno != oi2->ip_blkno) { + if (oi1->ip_blkno != oi2->ip_blkno) ocfs2_inode_unlock(inode2, 1); - brelse(*bh2); - *bh2 = NULL; - } if (status != -ENOENT) mlog_errno(status); diff --git a/trunk/fs/ocfs2/ocfs2.h b/trunk/fs/ocfs2/ocfs2.h index 51cd6898e7f1..70dd3b1798f1 100644 --- a/trunk/fs/ocfs2/ocfs2.h +++ b/trunk/fs/ocfs2/ocfs2.h @@ -420,11 +420,6 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; - /* - * How many clusters in our truncate log. - * It must be protected by osb_tl_inode->i_mutex. - */ - unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; diff --git a/trunk/fs/xfs/linux-2.6/sv.h b/trunk/fs/xfs/linux-2.6/sv.h new file mode 100644 index 000000000000..4dfc7c370819 --- /dev/null +++ b/trunk/fs/xfs/linux-2.6/sv.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_SV_H__ +#define __XFS_SUPPORT_SV_H__ + +#include +#include +#include + +/* + * Synchronisation variables. 
+ * + * (Parameters "pri", "svf" and "rts" are not implemented) + */ + +typedef struct sv_s { + wait_queue_head_t waiters; +} sv_t; + +static inline void _sv_wait(sv_t *sv, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&sv->waiters, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + + schedule(); + + remove_wait_queue(&sv->waiters, &wait); +} + +#define sv_init(sv,flag,name) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_destroy(sv) \ + /*NOTHING*/ +#define sv_wait(sv, pri, lock, s) \ + _sv_wait(sv, lock) +#define sv_signal(sv) \ + wake_up(&(sv)->waiters) +#define sv_broadcast(sv) \ + wake_up_all(&(sv)->waiters) + +#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/trunk/fs/xfs/linux-2.6/xfs_aops.c b/trunk/fs/xfs/linux-2.6/xfs_aops.c index ec7bbb5645b6..691f61223ed6 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_aops.c +++ b/trunk/fs/xfs/linux-2.6/xfs_aops.c @@ -38,6 +38,15 @@ #include #include +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -173,6 +182,9 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(ioend->io_type != IO_READ); + if (unlikely(ioend->io_error)) return 0; @@ -232,8 +244,10 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); + if (ioend->io_type != IO_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); + } /* * If we didn't complete processing of the ioend, requeue it to the @@ -304,63 +318,14 @@ STATIC int xfs_map_blocks( struct inode *inode, loff_t offset, + ssize_t count, struct xfs_bmbt_irec *imap, - int type, - int nonblocking) + int flags) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t count = 1 << inode->i_blkbits; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int bmapi_flags = XFS_BMAPI_ENTIRE; - int nimaps = 1; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); - - if (type == IO_UNWRITTEN) - bmapi_flags |= XFS_BMAPI_IGSTATE; - - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (nonblocking) - return -XFS_ERROR(EAGAIN); - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_maxioffset); - - if (offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (error) - return -XFS_ERROR(error); - - if (type == IO_DELALLOC && - (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, count, imap); - if (!error) - trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); - return -XFS_ERROR(error); - } + int nmaps = 1; + int new = 0; -#ifdef DEBUG - if (type == IO_UNWRITTEN) { - ASSERT(nimaps); - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - } -#endif - if (nimaps) - 
trace_xfs_map_blocks_found(ip, offset, count, type, imap); - return 0; + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int @@ -415,18 +380,26 @@ xfs_submit_ioend_bio( submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); + ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); + bio_put(bio); } STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { + struct bio *bio; int nvecs = bio_get_nr_vecs(bh->b_bdev); - struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (!bio); ASSERT(bio->bi_private == NULL); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio_get(bio); return bio; } @@ -497,8 +470,9 @@ xfs_submit_ioend( /* Pass 1 - start writeback */ do { next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { xfs_start_buffer_writeback(bh); + } } while ((ioend = next) != NULL); /* Pass 2 - submit I/O */ @@ -626,12 +600,116 @@ xfs_map_at_offset( ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + lock_buffer(bh); xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); } +/* + * Look for a page at index that is suitable for clustering. + */ +STATIC unsigned int +xfs_probe_page( + struct page *page, + unsigned int pg_offset) +{ + struct buffer_head *bh, *head; + int ret = 0; + + if (PageWriteback(page)) + return 0; + if (!PageDirty(page)) + return 0; + if (!page->mapping) + return 0; + if (!page_has_buffers(page)) + return 0; + + bh = head = page_buffers(page); + do { + if (!buffer_uptodate(bh)) + break; + if (!buffer_mapped(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); + + return ret; +} + +STATIC size_t +xfs_probe_cluster( + struct inode *inode, + struct page *startpage, + struct buffer_head *bh, + struct buffer_head *head) +{ + struct pagevec pvec; + pgoff_t tindex, tlast, tloff; + size_t total = 0; + int done = 0, i; + + /* First sum forwards in this page */ + do { + if (!buffer_uptodate(bh) || !buffer_mapped(bh)) + return total; + total += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + /* if we reached the end of the page, sum forwards in following pages */ + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + tindex = startpage->index + 1; + + /* Prune this back to avoid pathological behavior */ + tloff = min(tlast, startpage->index + 64); + + pagevec_init(&pvec, 0); + while (!done && tindex <= tloff) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + size_t pg_offset, pg_len = 0; + + if (tindex == tlast) { + pg_offset = + i_size_read(inode) & (PAGE_CACHE_SIZE - 1); + if (!pg_offset) { + done = 1; + break; + } + } else + pg_offset = PAGE_CACHE_SIZE; + + if (page->index == tindex && trylock_page(page)) { + pg_len = xfs_probe_page(page, pg_offset); + unlock_page(page); + } + + if (!pg_len) { + done = 1; + break; + } + + total += pg_len; + tindex++; + } + + pagevec_release(&pvec); + cond_resched(); + } + + return total; +} + /* * Test if a given page is suitable for writing as part of an unwritten * or delayed allocate extent. 
@@ -653,9 +731,9 @@ xfs_is_delayed_page( if (buffer_unwritten(bh)) acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELALLOC); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_OVERWRITE); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -680,7 +758,8 @@ xfs_convert_page( loff_t tindex, struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, - struct writeback_control *wbc) + struct writeback_control *wbc, + int all_bh) { struct buffer_head *bh, *head; xfs_off_t end_offset; @@ -735,30 +814,37 @@ xfs_convert_page( continue; } - if (buffer_unwritten(bh) || buffer_delay(bh) || - buffer_mapped(bh)) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) type = IO_UNWRITTEN; - else if (buffer_delay(bh)) - type = IO_DELALLOC; else - type = IO_OVERWRITE; + type = IO_DELAY; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - lock_buffer(bh); - if (type != IO_OVERWRITE) - xfs_map_at_offset(inode, bh, imap, offset); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); page_dirty--; count++; } else { - done = 1; + type = IO_NEW; + if (buffer_mapped(bh) && all_bh) { + lock_buffer(bh); + xfs_add_to_ioend(inode, bh, offset, + type, ioendp, done); + count++; + page_dirty--; + } else { + done = 1; + } } } while (offset += len, (bh = bh->b_this_page) != head); @@ -790,6 +876,7 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, + int all_bh, pgoff_t tlast) { struct pagevec pvec; @@ -804,7 +891,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc); + imap, ioendp, wbc, all_bh); if (done) break; } @@ -848,7 +935,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELALLOC)) + if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -915,10 +1002,10 @@ xfs_vm_writepage( unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; - ssize_t len; - int err, imap_valid = 0, uptodate = 1; + ssize_t size, len; + int flags, err, imap_valid = 0, uptodate = 1; int count = 0; - int nonblocking = 0; + int all_bh = 0; trace_xfs_writepage(inode, page, 0); @@ -969,14 +1056,10 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = IO_OVERWRITE; - - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) - nonblocking = 1; + flags = BMAPI_READ; + type = IO_NEW; do { - int new_ioend = 0; - if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -993,54 +1076,90 @@ xfs_vm_writepage( continue; } - if (buffer_unwritten(bh)) { - if (type != IO_UNWRITTEN) { - type = IO_UNWRITTEN; + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + + if (buffer_unwritten(bh) || buffer_delay(bh)) { + int new_ioend = 0; + + /* + * Make sure we don't use a read-only iomap + */ + if (flags == BMAPI_READ) imap_valid = 0; + + if (buffer_unwritten(bh)) { + type = IO_UNWRITTEN; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; + } else if (buffer_delay(bh)) { + type = IO_DELAY; + flags = BMAPI_ALLOCATE; + + if (wbc->sync_mode == WB_SYNC_NONE) + flags |= BMAPI_TRYLOCK; } - } else if (buffer_delay(bh)) { - if 
(type != IO_DELALLOC) { - type = IO_DELALLOC; - imap_valid = 0; + + if (!imap_valid) { + /* + * If we didn't have a valid mapping then we + * need to ensure that we put the new mapping + * in a new ioend structure. This needs to be + * done to ensure that the ioends correctly + * reflect the block mappings at io completion + * for unwritten extent conversion. + */ + new_ioend = 1; + err = xfs_map_blocks(inode, offset, len, + &imap, flags); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - } else if (buffer_uptodate(bh)) { - if (type != IO_OVERWRITE) { - type = IO_OVERWRITE; - imap_valid = 0; + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, new_ioend); + count++; } - } else { - if (PageUptodate(page)) { - ASSERT(buffer_mapped(bh)); - imap_valid = 0; + } else if (buffer_uptodate(bh)) { + /* + * we got here because the buffer is already mapped. + * That means it must already have extents allocated + * underneath it. Map the extent by reading it. + */ + if (!imap_valid || flags != BMAPI_READ) { + flags = BMAPI_READ; + size = xfs_probe_cluster(inode, page, bh, head); + err = xfs_map_blocks(inode, offset, size, + &imap, flags); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - continue; - } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - if (!imap_valid) { /* - * If we didn't have a valid mapping then we need to - * put the new mapping into a separate ioend structure. - * This ensures non-contiguous extents always have - * separate ioends, which is particularly important - * for unwritten extent conversion at I/O completion - * time. + * We set the type to IO_NEW in case we are doing a + * small write at EOF that is extending the file but + * without needing an allocation. We need to update the + * file size on I/O completion in this case so it is + * the same case as having just allocated a new extent + * that we are writing into for the first time. */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type, - nonblocking); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, offset); - } - if (imap_valid) { - lock_buffer(bh); - if (type != IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, &ioend, - new_ioend); - count++; + type = IO_NEW; + if (trylock_buffer(bh)) { + if (imap_valid) + all_bh = 1; + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, !imap_valid); + count++; + } else { + imap_valid = 0; + } + } else if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; } if (!iohead) @@ -1069,7 +1188,7 @@ xfs_vm_writepage( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, end_index); + wbc, all_bh, end_index); } if (iohead) @@ -1138,19 +1257,13 @@ __xfs_get_blocks( int create, int direct) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; + int flags = create ? 
BMAPI_WRITE : BMAPI_READ; struct xfs_bmbt_irec imap; - int nimaps = 1; xfs_off_t offset; ssize_t size; + int nimap = 1; int new = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + int error; offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); @@ -1159,45 +1272,15 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (create) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_map_shared(ip); - } - - ASSERT(offset <= mp->m_maxioffset); - if (offset + size > mp->m_maxioffset) - size = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); - offset_fsb = XFS_B_TO_FSBT(mp, offset); + if (direct && create) + flags |= BMAPI_DIRECT; - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, + &new); if (error) - goto out_unlock; - - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK))) { - if (direct) { - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, size, &imap); - } - if (error) - goto out_unlock; - - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); - } else if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); - } else { - trace_xfs_get_blocks_notfound(ip, offset, size); - goto out_unlock; - } - xfs_iunlock(ip, lockmode); + return -error; + if (nimap == 0) + return 0; if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1264,10 +1347,6 @@ __xfs_get_blocks( } return 0; - -out_unlock: - xfs_iunlock(ip, lockmode); - return -error; } int @@ -1355,7 +1434,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); + iocb->private = xfs_alloc_ioend(inode, IO_NEW); ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/trunk/fs/xfs/linux-2.6/xfs_aops.h b/trunk/fs/xfs/linux-2.6/xfs_aops.h index 71f721e1a71f..c5057fb6237a 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_aops.h +++ b/trunk/fs/xfs/linux-2.6/xfs_aops.h @@ -22,22 +22,6 @@ extern struct workqueue_struct *xfsdatad_workqueue; extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; -/* - * Types of I/O for bmap clustering and I/O completion tracking. - */ -enum { - IO_DIRECT = 0, /* special case for direct I/O ioends */ - IO_DELALLOC, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_OVERWRITE, /* mapping covers already allocated extent */ -}; - -#define XFS_IO_TYPES \ - { 0, "" }, \ - { IO_DELALLOC, "delalloc" }, \ - { IO_UNWRITTEN, "unwritten" }, \ - { IO_OVERWRITE, "overwrite" } - /* * xfs_ioend struct manages large extent writes for XFS. * It can manage several multi-page bio's at once. 
diff --git a/trunk/fs/xfs/linux-2.6/xfs_buf.c b/trunk/fs/xfs/linux-2.6/xfs_buf.c index 92f1f2acc6ab..4c5deb6e9e31 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_buf.c +++ b/trunk/fs/xfs/linux-2.6/xfs_buf.c @@ -44,7 +44,12 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); +static struct shrinker xfs_buf_shake = { + .shrink = xfsbufd_wakeup, + .seeks = DEFAULT_SEEKS, +}; static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -163,79 +168,8 @@ test_page_region( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. + * Internal xfs_buf_t object manipulation */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unneccessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * When we mark a buffer stale, we remove the buffer from the LRU and clear the - * b_lru_ref count so that the buffer is freed immediately when the buffer - * reference count falls to zero. If the buffer is already on the LRU, we need - * to remove the reference that LRU holds on the buffer. - * - * This prevents build-up of stale buffers on the LRU. 
- */ -void -xfs_buf_stale( - struct xfs_buf *bp) -{ - bp->b_flags |= XBF_STALE; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } - ASSERT(atomic_read(&bp->b_hold) >= 1); -} STATIC void _xfs_buf_initialize( @@ -252,9 +186,7 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); - atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); - INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ @@ -330,8 +262,6 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); - ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -407,6 +337,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -897,7 +828,6 @@ xfs_buf_rele( if (!pag) { ASSERT(!bp->b_relse); - ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); @@ -905,19 +835,13 @@ xfs_buf_rele( } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); spin_unlock(&pag->pag_buf_lock); bp->b_relse(bp); - } else if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); - spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1514,84 +1438,51 @@ xfs_buf_iomove( */ /* - * Wait for any bufs with callbacks that have been submitted but have not yet - * returned. These buffers will have an elevated hold count, so wait on those - * while freeing all the buffers only held by the LRU. + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. */ void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + struct xfs_perag *pag; + uint i; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - spin_unlock(&btp->bt_lru_lock); + for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { + pag = xfs_perag_get(btp->bt_mount, i); + spin_lock(&pag->pag_buf_lock); + while (rb_first(&pag->pag_buf_tree)) { + spin_unlock(&pag->pag_buf_lock); delay(100); - goto restart; + spin_lock(&pag->pag_buf_lock); } - /* - * clear the LRU reference count so the bufer doesn't get - * ignored in xfs_buf_rele(). 
- */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( - struct shrinker *shrink, - int nr_to_scan, - gfp_t mask) -{ - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - LIST_HEAD(dispose); - - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). - */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); +/* + * buftarg list for delwrite queue processing + */ +static LIST_HEAD(xfs_buftarg_list); +static DEFINE_SPINLOCK(xfs_buftarg_lock); - while (!list_empty(&dispose)) { - bp = list_first_entry(&dispose, struct xfs_buf, b_lru); - list_del_init(&bp->b_lru); - xfs_buf_rele(bp); - } +STATIC void +xfs_register_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_add(&btp->bt_list, &xfs_buftarg_list); + spin_unlock(&xfs_buftarg_lock); +} - return btp->bt_lru_nr; +STATIC void +xfs_unregister_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_del(&btp->bt_list); + spin_unlock(&xfs_buftarg_lock); } void @@ -1599,14 +1490,17 @@ xfs_free_buftarg( struct xfs_mount *mp, struct xfs_buftarg *btp) { - unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); iput(btp->bt_mapping->host); + /* Unregister the buftarg first so that we don't get a + * wakeup finding a non-existent task + */ + xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); + kmem_free(btp); } @@ -1703,13 +1597,20 @@ xfs_alloc_delwrite_queue( xfs_buftarg_t *btp, const char *fsname) { + int error = 0; + + INIT_LIST_HEAD(&btp->bt_list); INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; + if (IS_ERR(btp->bt_task)) { + error = PTR_ERR(btp->bt_task); + goto out_error; + } + xfs_register_buftarg(btp); +out_error: + return error; } xfs_buftarg_t * @@ -1726,17 +1627,12 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; - btp->bt_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&btp->bt_shrinker); return btp; error: @@ -1841,6 +1737,27 @@ xfs_buf_runall_queues( flush_workqueue(queue); } +STATIC int +xfsbufd_wakeup( + struct shrinker *shrink, + int priority, + gfp_t mask) +{ + xfs_buftarg_t *btp; + + spin_lock(&xfs_buftarg_lock); + list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 
+ if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) + continue; + if (list_empty(&btp->bt_delwrite_queue)) + continue; + set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); + wake_up_process(btp->bt_task); + } + spin_unlock(&xfs_buftarg_lock); + return 0; +} + /* * Move as many buffers as specified to the supplied list * idicating if we skipped any buffers to prevent deadlocks. @@ -2035,6 +1952,7 @@ xfs_buf_init(void) if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; + register_shrinker(&xfs_buf_shake); return 0; out_destroy_xfsdatad_workqueue: @@ -2050,6 +1968,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { + unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); diff --git a/trunk/fs/xfs/linux-2.6/xfs_buf.h b/trunk/fs/xfs/linux-2.6/xfs_buf.h index a76c2428faff..383a3f37cf98 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_buf.h +++ b/trunk/fs/xfs/linux-2.6/xfs_buf.h @@ -128,15 +128,10 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; + struct list_head bt_list; struct list_head bt_delwrite_queue; spinlock_t bt_delwrite_lock; unsigned long bt_flags; - - /* LRU control structures */ - struct shrinker bt_shrinker; - struct list_head bt_lru; - spinlock_t bt_lru_lock; - unsigned int bt_lru_nr; } xfs_buftarg_t; /* @@ -169,11 +164,9 @@ typedef struct xfs_buf { xfs_off_t b_file_offset; /* offset in file */ size_t b_buffer_length;/* size of buffer in bytes */ atomic_t b_hold; /* reference count */ - atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ - struct list_head b_lru; /* lru list */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -271,8 +264,7 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); +#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ @@ -336,15 +328,9 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) -{ - atomic_set(&bp->b_lru_ref, lru_ref); -} -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) +#define XFS_BUF_SET_REF(bp, ref) do { } while (0) #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) diff --git a/trunk/fs/xfs/linux-2.6/xfs_export.c b/trunk/fs/xfs/linux-2.6/xfs_export.c index fc0114da7fdd..3764d74790ec 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_export.c +++ b/trunk/fs/xfs/linux-2.6/xfs_export.c @@ -70,16 +70,8 @@ xfs_fs_encode_fh( else fileid_type = FILEID_INO32_GEN_PARENT; - /* - * If the the filesystem may contain 64bit inode numbers, we need - * to use larger file handles that can represent them. - * - * While we only allocate inodes that do not fit into 32 bits any - * large enough filesystem may contain them, thus the slightly - * confusing looking conditional below. 
- */ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || - (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + /* filesystem may contain 64bit inode numbers */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/trunk/fs/xfs/linux-2.6/xfs_linux.h b/trunk/fs/xfs/linux-2.6/xfs_linux.h index 096494997747..214ddd71ff79 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_linux.h +++ b/trunk/fs/xfs/linux-2.6/xfs_linux.h @@ -37,6 +37,7 @@ #include #include +#include #include #include diff --git a/trunk/fs/xfs/linux-2.6/xfs_super.c b/trunk/fs/xfs/linux-2.6/xfs_super.c index c51faaa5e291..064f964d4f3c 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_super.c +++ b/trunk/fs/xfs/linux-2.6/xfs_super.c @@ -834,11 +834,8 @@ xfsaild_wakeup( struct xfs_ail *ailp, xfs_lsn_t threshold_lsn) { - /* only ever move the target forwards */ - if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); - } + ailp->xa_target = threshold_lsn; + wake_up_process(ailp->xa_task); } STATIC int @@ -850,17 +847,8 @@ xfsaild( long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - /* - * for short sleeps indicating congestion, don't allow us to - * get woken early. Otherwise all we do is bang on the AIL lock - * without making progress. - */ - if (tout && tout <= 20) - __set_current_state(TASK_KILLABLE); - else - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + schedule_timeout_interruptible(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); @@ -1130,8 +1118,6 @@ xfs_fs_evict_inode( */ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); xfs_inactive(ip); } diff --git a/trunk/fs/xfs/linux-2.6/xfs_sync.c b/trunk/fs/xfs/linux-2.6/xfs_sync.c index a02480de9759..afb0d7cfad1c 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_sync.c +++ b/trunk/fs/xfs/linux-2.6/xfs_sync.c @@ -53,30 +53,14 @@ xfs_inode_ag_walk_grab( { struct inode *inode = VFS_I(ip); - ASSERT(rcu_read_lock_held()); - - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return EFSCORRUPTED; + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + return ENOENT; + /* If we can't grab the inode, it must on it's way to reclaim. 
*/ if (!igrab(inode)) return ENOENT; @@ -88,10 +72,6 @@ xfs_inode_ag_walk_grab( /* inode is valid */ return 0; - -out_unlock_noent: - spin_unlock(&ip->i_flags_lock); - return ENOENT; } STATIC int @@ -118,12 +98,12 @@ xfs_inode_ag_walk( int error = 0; int i; - rcu_read_lock(); + read_lock(&pag->pag_ici_lock); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); if (!nr_found) { - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); break; } @@ -138,26 +118,18 @@ xfs_inode_ag_walk( batch[i] = NULL; /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); for (i = 0; i < nr_found; i++) { if (!batch[i]) @@ -620,12 +592,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } @@ -667,14 +639,9 @@ xfs_reclaim_inode_grab( struct xfs_inode *ip, int flags) { - ASSERT(rcu_read_lock_held()); - - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; /* - * do some unlocked checks first to avoid unnecessary lock traffic. + * do some unlocked checks first to avoid unnecceary lock traffic. * The first is a flush lock check, the second is a already in reclaim * check. Only do these checks if we are not going to block on locks. */ @@ -687,16 +654,11 @@ xfs_reclaim_inode_grab( * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the * XFS_IRECLAIM flag set it will not touch us. - * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. */ spin_lock(&ip->i_flags_lock); - if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || - __xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* not a reclaim candidate. */ + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ spin_unlock(&ip->i_flags_lock); return 1; } @@ -833,12 +795,12 @@ xfs_reclaim_inode( * added to the tree assert that it's been there before to catch * problems with the inode life time early on. 
*/ - spin_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); - spin_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate @@ -902,14 +864,14 @@ xfs_reclaim_inodes_ag( struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; - rcu_read_lock(); + write_lock(&pag->pag_ici_lock); nr_found = radix_tree_gang_lookup_tag( &pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { - rcu_read_unlock(); + write_unlock(&pag->pag_ici_lock); break; } @@ -929,24 +891,14 @@ xfs_reclaim_inodes_ag( * occur if we have inodes in the last block of * the AG and we are currently pointing to the * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); + write_unlock(&pag->pag_ici_lock); for (i = 0; i < nr_found; i++) { if (!batch[i]) diff --git a/trunk/fs/xfs/linux-2.6/xfs_trace.h b/trunk/fs/xfs/linux-2.6/xfs_trace.h index 647af2a2e7aa..acef2e98c594 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_trace.h +++ b/trunk/fs/xfs/linux-2.6/xfs_trace.h @@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __field(int, curr_res) __field(int, unit_res) __field(unsigned int, flags) - __field(int, reserveq) - __field(int, writeq) + __field(void *, reserve_headq) + __field(void *, write_headq) __field(int, grant_reserve_cycle) __field(int, grant_reserve_bytes) __field(int, grant_write_cycle) @@ -784,21 +784,19 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserveq = list_empty(&log->l_reserveq); - __entry->writeq = list_empty(&log->l_writeq); - xlog_crack_grant_head(&log->l_grant_reserve_head, - &__entry->grant_reserve_cycle, - &__entry->grant_reserve_bytes); - xlog_crack_grant_head(&log->l_grant_write_head, - &__entry->grant_write_cycle, - &__entry->grant_write_bytes); + __entry->reserve_headq = log->l_reserve_headq; + __entry->write_headq = log->l_write_headq; + __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; + __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; + __entry->grant_write_cycle = log->l_grant_write_cycle; + __entry->grant_write_bytes = log->l_grant_write_bytes; __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; - __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + __entry->tail_lsn = log->l_tail_lsn; ), TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " - "t_unit_res %u t_flags %s reserveq %s " - "writeq %s grant_reserve_cycle %d " + "t_unit_res %u t_flags %s reserve_headq 0x%p " + "write_headq 0x%p grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", @@ -809,8 +807,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res, __entry->unit_res, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), - 
__entry->reserveq ? "empty" : "active", - __entry->writeq ? "empty" : "active", + __entry->reserve_headq, + __entry->write_headq, __entry->grant_reserve_cycle, __entry->grant_reserve_bytes, __entry->grant_write_cycle, @@ -837,7 +835,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); @@ -845,7 +842,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); @@ -939,10 +935,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -DECLARE_EVENT_CLASS(xfs_imap_class, +DECLARE_EVENT_CLASS(xfs_iomap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int flags, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, flags, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -950,7 +946,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, flags) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -962,13 +958,13 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->flags = flags; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd type %s " + "offset 0x%llx count %zd flags %s " "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -976,21 +972,20 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->new_size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __print_flags(__entry->flags, "|", BMAPI_FLAGS), __entry->startoff, (__int64_t)__entry->startblock, __entry->blockcount) ) #define DEFINE_IOMAP_EVENT(name) \ -DEFINE_EVENT(xfs_imap_class, name, \ +DEFINE_EVENT(xfs_iomap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_get_blocks_found); -DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); + int flags, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, flags, irec)) +DEFINE_IOMAP_EVENT(xfs_iomap_enter); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1027,7 +1022,6 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); -DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); TRACE_EVENT(xfs_itruncate_start, @@ -1426,7 +1420,6 @@ DEFINE_EVENT(xfs_alloc_class, name, \ TP_PROTO(struct xfs_alloc_arg *args), \ TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); -DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); diff --git a/trunk/fs/xfs/quota/xfs_dquot.c b/trunk/fs/xfs/quota/xfs_dquot.c index d22aa3103106..faf8e1a83a12 100644 --- a/trunk/fs/xfs/quota/xfs_dquot.c +++ b/trunk/fs/xfs/quota/xfs_dquot.c @@ -149,6 +149,7 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); + sv_destroy(&dqp->q_pinwait); kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); atomic_dec(&xfs_Gqm->qm_totaldquots); diff --git a/trunk/fs/xfs/xfs_ag.h b/trunk/fs/xfs/xfs_ag.h index 58632cc17f2d..63c7a1a6c022 100644 --- a/trunk/fs/xfs/xfs_ag.h +++ b/trunk/fs/xfs/xfs_ag.h @@ -227,7 +227,7 @@ typedef struct xfs_perag { atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - spinlock_t pag_ici_lock; /* incore inode cache lock */ + rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ struct mutex pag_ici_reclaim_lock; /* serialisation point */ diff --git a/trunk/fs/xfs/xfs_alloc.c b/trunk/fs/xfs/xfs_alloc.c index fa8723f5870a..112abc439ca5 100644 --- a/trunk/fs/xfs/xfs_alloc.c +++ b/trunk/fs/xfs/xfs_alloc.c @@ -577,58 +577,61 @@ xfs_alloc_ag_vextent_exact( xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); - /* * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); - + args->agno, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno * if any free block does. 
*/ - error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); - if (error) + if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) goto error0; - if (!i) - goto not_found; - + if (!i) { + /* + * Didn't find it, return null. + */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } /* * Grab the freespace record. */ - error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); - if (error) + if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); minend = args->agbno + args->minlen; maxend = args->agbno + args->maxlen; fend = fbno + flen; - /* * Give up if the freespace isn't long enough for the minimum request. */ - if (fend < minend) - goto not_found; - + if (fend < minend) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } /* * End of extent will be smaller of the freespace end and the * maximal requested end. - * - * Fix the length according to mod and prod if given. */ end = XFS_AGBLOCK_MIN(fend, maxend); + /* + * Fix the length according to mod and prod if given. + */ args->len = end - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + return 0; + } rlen = args->len; ASSERT(args->agbno + rlen <= fend); end = args->agbno + rlen; - /* * We are allocating agbno for rlen [agbno .. end] * Allocate/initialize a cursor for the by-size btree. @@ -637,25 +640,16 @@ xfs_alloc_ag_vextent_exact( args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, - args->len, XFSA_FIXUP_BNO_OK); - if (error) { + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); goto error0; } - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - args->wasfromfl = 0; trace_xfs_alloc_exact_done(args); - return 0; - -not_found: - /* Didn't find it, return null. */ - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - trace_xfs_alloc_exact_notfound(args); + args->wasfromfl = 0; return 0; error0: @@ -664,95 +658,6 @@ xfs_alloc_ag_vextent_exact( return error; } -/* - * Search the btree in a given direction via the search cursor and compare - * the records found against the good extent we've already found. - */ -STATIC int -xfs_alloc_find_best_extent( - struct xfs_alloc_arg *args, /* allocation argument structure */ - struct xfs_btree_cur **gcur, /* good cursor */ - struct xfs_btree_cur **scur, /* searching cursor */ - xfs_agblock_t gdiff, /* difference for search comparison */ - xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, - xfs_extlen_t *slena, /* aligned length */ - int dir) /* 0 = search right, 1 = search left */ -{ - xfs_agblock_t bno; - xfs_agblock_t new; - xfs_agblock_t sdiff; - int error; - int i; - - /* The good extent is perfect, no need to search. */ - if (!gdiff) - goto out_use_good; - - /* - * Look until we find a better one, run out of space or run off the end. 
- */ - do { - error = xfs_alloc_get_rec(*scur, sbno, slen, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, - args->minlen, &bno, slena); - - /* - * The good extent is closer than this one. - */ - if (!dir) { - if (bno >= args->agbno + gdiff) - goto out_use_good; - } else { - if (bno <= args->agbno - gdiff) - goto out_use_good; - } - - /* - * Same distance, compare length and pick the best. - */ - if (*slena >= args->minlen) { - args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); - xfs_alloc_fix_len(args); - - sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbno, - *slen, &new); - - /* - * Choose closer size and invalidate other cursor. - */ - if (sdiff < gdiff) - goto out_use_search; - goto out_use_good; - } - - if (!dir) - error = xfs_btree_increment(*scur, 0, &i); - else - error = xfs_btree_decrement(*scur, 0, &i); - if (error) - goto error0; - } while (i); - -out_use_good: - xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); - *scur = NULL; - return 0; - -out_use_search: - xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); - *gcur = NULL; - return 0; - -error0: - /* caller invalidates cursors */ - return error; -} - /* * Allocate a variable extent near bno in the allocation group agno. * Extent's length (returned in len) will be between minlen and maxlen, @@ -1020,45 +925,203 @@ xfs_alloc_ag_vextent_near( } } } while (bno_cur_lt || bno_cur_gt); - /* * Got both cursors still active, need to find better entry. */ if (bno_cur_lt && bno_cur_gt) { + /* + * Left side is long enough, look for a right side entry. + */ if (ltlena >= args->minlen) { /* - * Left side is good, look for a right side entry. + * Fix up the length. */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + rlen = args->len; + ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, ltlen, <new); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_lt, &bno_cur_gt, - ltdiff, >bno, >len, >lena, - 0 /* search right */); - } else { - ASSERT(gtlena >= args->minlen); - /* - * Right side is good, look for a left side entry. + * Not perfect. + */ + if (ltdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_gt, >bno, + >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(gtbno, gtlen, + args->alignment, args->minlen, + >bnoa, >lena); + /* + * The left one is clearly better. + */ + if (gtbnoa >= args->agbno + ltdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (gtlena >= args->minlen) { + args->len = + XFS_EXTLEN_MIN(gtlena, + args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + gtdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + gtbno, gtlen, >new); + /* + * Right side is better. + */ + if (gtdiff < ltdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + /* + * Left side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + break; + } + /* + * Fell off the right end. 
+ */ + if ((error = xfs_btree_increment( + bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + } + } + /* + * The left side is perfect, trash the right side. + */ + else { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + /* + * It's the right side that was found first, look left. + */ + else { + /* + * Fix up the length. */ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); - gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + rlen = args->len; + gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, gtbno, gtlen, >new); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_gt, &bno_cur_lt, - gtdiff, <bno, <len, <lena, - 1 /* search left */); + /* + * Right side entry isn't perfect. + */ + if (gtdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_lt, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena); + /* + * The right one is clearly better. + */ + if (ltbnoa <= args->agbno - gtdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (ltlena >= args->minlen) { + args->len = XFS_EXTLEN_MIN( + ltlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + ltdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + ltbno, ltlen, <new); + /* + * Left side is better. + */ + if (ltdiff < gtdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Right side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + break; + } + /* + * Fell off the left end. + */ + if ((error = xfs_btree_decrement( + bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + } + } + /* + * The right side is perfect, trash the left side. + */ + else { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } } - - if (error) - goto error0; } - /* * If we couldn't get anything, give up. */ @@ -1067,7 +1130,6 @@ xfs_alloc_ag_vextent_near( args->agbno = NULLAGBLOCK; return 0; } - /* * At this point we have selected a freespace entry, either to the * left or to the right. If it's on the right, copy all the @@ -1084,7 +1146,6 @@ xfs_alloc_ag_vextent_near( j = 1; } else j = 0; - /* * Fix up the length and compute the useful address. */ diff --git a/trunk/fs/xfs/xfs_attr_leaf.c b/trunk/fs/xfs/xfs_attr_leaf.c index 71e90dc2aeb1..a6cff8edcdb6 100644 --- a/trunk/fs/xfs/xfs_attr_leaf.c +++ b/trunk/fs/xfs/xfs_attr_leaf.c @@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. 
*/ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); /* * Scan the attribute list for the rest of the entries, storing @@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.value = kmem_alloc(valuelen, KM_SLEEP); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); diff --git a/trunk/fs/xfs/xfs_btree.c b/trunk/fs/xfs/xfs_btree.c index 2f9e97c128a0..04f9cca8da7e 100644 --- a/trunk/fs/xfs/xfs_btree.c +++ b/trunk/fs/xfs/xfs_btree.c @@ -634,8 +634,9 @@ xfs_btree_read_bufl( return error; } ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp) + if (bp != NULL) { XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + } *bpp = bp; return 0; } @@ -943,13 +944,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/trunk/fs/xfs/xfs_buf_item.c b/trunk/fs/xfs/xfs_buf_item.c index ed2b65f3f8b9..2686d0d54c5b 100644 --- a/trunk/fs/xfs/xfs_buf_item.c +++ b/trunk/fs/xfs/xfs_buf_item.c @@ -142,7 +142,7 @@ xfs_buf_item_log_check( #endif STATIC void xfs_buf_error_relse(xfs_buf_t *bp); -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); /* * This returns the number of log iovecs needed to log the @@ -450,7 +450,7 @@ xfs_buf_item_unpin( * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp); + xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { @@ -918,26 +918,15 @@ xfs_buf_attach_iodone( XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); } -/* - * We can have many callbacks on a buffer. Running the callbacks individually - * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining lip->li_bio_list for other items - * of the same type and callback to be processed in the first call. - * - * As a result, the loop walking the callback list below will also modify the - * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new head of the list. This allows the - * callback to scan and modify the list attached to the buffer and we don't - * have to care about maintaining a next item pointer. 
- */ STATIC void xfs_buf_do_callbacks( - struct xfs_buf *bp) + xfs_buf_t *bp, + xfs_log_item_t *lip) { - struct xfs_log_item *lip; + xfs_log_item_t *nlip; - while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { - XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); + while (lip != NULL) { + nlip = lip->li_bio_list; ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -947,6 +936,7 @@ xfs_buf_do_callbacks( */ lip->li_bio_list = NULL; lip->li_cb(bp, lip); + lip = nlip; } } @@ -980,7 +970,7 @@ xfs_buf_iodone_callbacks( ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); XFS_BUF_SUPER_STALE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp); + xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); xfs_buf_ioend(bp, 0); @@ -1039,7 +1029,7 @@ xfs_buf_iodone_callbacks( return; } - xfs_buf_do_callbacks(bp); + xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); xfs_buf_ioend(bp, 0); @@ -1073,7 +1063,7 @@ xfs_buf_error_relse( * We have to unpin the pinned buffers so do the * callbacks. */ - xfs_buf_do_callbacks(bp); + xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_SET_BRELSE_FUNC(bp,NULL); diff --git a/trunk/fs/xfs/xfs_buf_item.h b/trunk/fs/xfs/xfs_buf_item.h index b6ecd2061e7c..0e2ed43f16c7 100644 --- a/trunk/fs/xfs/xfs_buf_item.h +++ b/trunk/fs/xfs/xfs_buf_item.h @@ -105,6 +105,17 @@ typedef struct xfs_buf_log_item { xfs_buf_log_format_t bli_format; /* in-log header */ } xfs_buf_log_item_t; +/* + * This structure is used during recovery to record the buf log + * items which have been canceled and should not be replayed. + */ +typedef struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct xfs_buf_cancel *bc_next; +} xfs_buf_cancel_t; + void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); diff --git a/trunk/fs/xfs/xfs_extfree_item.c b/trunk/fs/xfs/xfs_extfree_item.c index 75f2ef60e579..a55e687bf562 100644 --- a/trunk/fs/xfs/xfs_extfree_item.c +++ b/trunk/fs/xfs/xfs_extfree_item.c @@ -47,28 +47,6 @@ xfs_efi_item_free( kmem_zone_free(xfs_efi_zone, efip); } -/* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the - * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees - * the EFI. - */ -STATIC void -__xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item); - xfs_efi_item_free(efip); - } -} - /* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. 
It just logs the efi_log_format @@ -96,8 +74,7 @@ xfs_efi_item_format( struct xfs_efi_log_item *efip = EFI_ITEM(lip); uint size; - ASSERT(atomic_read(&efip->efi_next_extent) == - efip->efi_format.efi_nextents); + ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; @@ -122,12 +99,10 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the last place at - * which the EFI is manipulated during a transaction. If we are being asked to - * remove the EFI it's because the transaction has been cancelled and by - * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() (via - * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + * While EFIs cannot really be pinned, the unpin operation is the + * last place at which the EFI is manipulated during a transaction. + * Here we coordinate with xfs_efi_cancel() to determine who gets to + * free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -135,14 +110,20 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + if (efip->efi_flags & XFS_EFI_CANCELED) { + if (remove) + xfs_trans_del_item(lip); - if (remove) { - ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - xfs_trans_del_item(lip); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, lip); xfs_efi_item_free(efip); - return; + } else { + efip->efi_flags |= XFS_EFI_COMMITTED; + spin_unlock(&ailp->xa_lock); } - __xfs_efi_release(efip); } /* @@ -171,20 +152,16 @@ xfs_efi_item_unlock( } /* - * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. For bulk transaction committed - * processing, the EFI may be processed but not yet unpinned prior to the EFD - * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected - * when processing the EFD. + * The EFI is logged only once and cannot be moved in the log, so + * simply return the lsn at which it's been logged. The canceled + * flag is not paid any attention here. Checking for that is delayed + * until the EFI is unpinned. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { - struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -253,7 +230,6 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; - atomic_set(&efip->efi_next_extent, 0); return efip; } @@ -313,18 +289,37 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to the given - * efi item. Each efd calls this with the number of extents that it has - * logged, and when the sum of these reaches the total number of extents logged - * by this efi item we can free the efi item. + * This is called by the efd item code below to release references to + * the given efi item. Each efd calls this with the number of + * extents that it has logged, and when the sum of these reaches + * the total number of extents logged by this efi item we can free + * the efi item. + * + * Freeing the efi item requires that we remove it from the AIL. 
+ * We'll use the AIL lock to protect our counters as well as + * the removal from the AIL. */ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) - __xfs_efi_release(efip); + struct xfs_ail *ailp = efip->efi_item.li_ailp; + int extents_left; + + ASSERT(efip->efi_next_extent > 0); + ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); + + spin_lock(&ailp->xa_lock); + ASSERT(efip->efi_next_extent >= nextents); + efip->efi_next_extent -= nextents; + extents_left = efip->efi_next_extent; + if (extents_left == 0) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); + xfs_efi_item_free(efip); + } else { + spin_unlock(&ailp->xa_lock); + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/trunk/fs/xfs/xfs_extfree_item.h b/trunk/fs/xfs/xfs_extfree_item.h index 375f68e42531..0d22c56fdf64 100644 --- a/trunk/fs/xfs/xfs_extfree_item.h +++ b/trunk/fs/xfs/xfs_extfree_item.h @@ -111,10 +111,11 @@ typedef struct xfs_efd_log_format_64 { #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flag bits. Manipulated by set/clear/test_bit operators. + * Define EFI flags. */ -#define XFS_EFI_RECOVERED 1 -#define XFS_EFI_COMMITTED 2 +#define XFS_EFI_RECOVERED 0x1 +#define XFS_EFI_COMMITTED 0x2 +#define XFS_EFI_CANCELED 0x4 /* * This is the "extent free intention" log item. It is used @@ -124,8 +125,8 @@ typedef struct xfs_efd_log_format_64 { */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; - atomic_t efi_next_extent; - unsigned long efi_flags; /* misc flags */ + uint efi_flags; /* misc flags */ + uint efi_next_extent; xfs_efi_log_format_t efi_format; } xfs_efi_log_item_t; diff --git a/trunk/fs/xfs/xfs_fsops.c b/trunk/fs/xfs/xfs_fsops.c index f56d30e8040c..a7c116e814af 100644 --- a/trunk/fs/xfs/xfs_fsops.c +++ b/trunk/fs/xfs/xfs_fsops.c @@ -374,7 +374,6 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; - xfs_set_low_space_thresholds(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { diff --git a/trunk/fs/xfs/xfs_iget.c b/trunk/fs/xfs/xfs_iget.c index cb9b6d1469f7..d7de5a3f7867 100644 --- a/trunk/fs/xfs/xfs_iget.c +++ b/trunk/fs/xfs/xfs_iget.c @@ -42,17 +42,6 @@ #include "xfs_trace.h" -/* - * Define xfs inode iolock lockdep classes. We need to ensure that all active - * inodes are considered the same for lockdep purposes, including inodes that - * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to - * guarantee the locks are considered the same when there are multiple lock - * initialisation siteÑ•. Also, define a reclaimable inode class so it is - * obvious in lockdep reports which class the report is against. - */ -static struct lock_class_key xfs_iolock_active; -struct lock_class_key xfs_iolock_reclaimable; - /* * Allocate and initialise an xfs_inode. 
*/ @@ -80,11 +69,8 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -99,6 +85,9 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; + /* prevent anyone from using this yet */ + VFS_I(ip)->i_state = I_NEW; + return ip; } @@ -156,18 +145,7 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. - */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); + call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); } /* @@ -177,29 +155,14 @@ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, - xfs_ino_t ino, int flags, - int lock_flags) __releases(RCU) + int lock_flags) __releases(pag->pag_ici_lock) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - /* * If we are racing with another cache hit that is currently @@ -242,7 +205,7 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); error = -inode_init_always(mp->m_super, inode); if (error) { @@ -250,7 +213,7 @@ xfs_iget_cache_hit( * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ - rcu_read_lock(); + read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; @@ -260,20 +223,14 @@ xfs_iget_cache_hit( goto out_error; } - spin_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -284,7 +241,7 @@ xfs_iget_cache_hit( /* We've got a live one. 
*/ spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); trace_xfs_iget_hit(ip); } @@ -298,7 +255,7 @@ xfs_iget_cache_hit( out_error: spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); return error; } @@ -351,7 +308,7 @@ xfs_iget_cache_miss( BUG(); } - spin_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); @@ -366,14 +323,14 @@ xfs_iget_cache_miss( ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - spin_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: - spin_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); @@ -420,7 +377,7 @@ xfs_iget( xfs_agino_t agino; /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ @@ -429,15 +386,15 @@ xfs_iget( again: error = 0; - rcu_read_lock(); + read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); if (error) goto out_error_or_again; } else { - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); XFS_STATS_INC(xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, diff --git a/trunk/fs/xfs/xfs_inode.c b/trunk/fs/xfs/xfs_inode.c index be7cf625421f..108c7a085f94 100644 --- a/trunk/fs/xfs/xfs_inode.c +++ b/trunk/fs/xfs/xfs_inode.c @@ -887,7 +887,7 @@ xfs_iread( * around for a while. This helps to keep recently accessed * meta-data in-core longer. */ - xfs_buf_set_ref(bp, XFS_INO_REF); + XFS_BUF_SET_REF(bp, XFS_INO_REF); /* * Use xfs_trans_brelse() to release the buffer containing the @@ -2000,32 +2000,16 @@ xfs_ifree_cluster( */ for (i = 0; i < ninodes; i++) { retry: - rcu_read_lock(); + read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); + /* Inode not in memory or stale, nothing to do */ + if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { + read_unlock(&pag->pag_ici_lock); continue; } - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. 
- */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - /* * Don't try to lock/unlock the current inode, but we * _cannot_ skip the other inodes that we did not find @@ -2035,11 +2019,11 @@ xfs_ifree_cluster( */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); delay(1); goto retry; } - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); @@ -2645,7 +2629,7 @@ xfs_iflush_cluster( mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); + read_lock(&pag->pag_ici_lock); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, first_index, inodes_per_cluster); @@ -2656,21 +2640,9 @@ xfs_iflush_cluster( iq = ilist[i]; if (iq == ip) continue; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); - continue; - } - spin_unlock(&ip->i_flags_lock); - + /* if the inode lies outside this cluster, we're done. */ + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) + break; /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated @@ -2720,7 +2692,7 @@ xfs_iflush_cluster( } out_free: - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); kmem_free(ilist); out_put: xfs_perag_put(pag); @@ -2732,7 +2704,7 @@ xfs_iflush_cluster( * Corruption detected in the clustering loop. Invalidate the * inode buffer and shut down the filesystem. */ - rcu_read_unlock(); + read_unlock(&pag->pag_ici_lock); /* * Clean up the buffer. If it was B_DELWRI, just release it -- * brelse can handle it with no problems. If not, shut down the diff --git a/trunk/fs/xfs/xfs_inode.h b/trunk/fs/xfs/xfs_inode.h index 5c95fa8ec11d..fb2ca2e4cdc9 100644 --- a/trunk/fs/xfs/xfs_inode.h +++ b/trunk/fs/xfs/xfs_inode.h @@ -376,13 +376,12 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ /* * Flags for inode locking. 
@@ -439,8 +438,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) -extern struct lock_class_key xfs_iolock_reclaimable; - /* * Flags for xfs_itruncate_start(). */ diff --git a/trunk/fs/xfs/xfs_inode_item.c b/trunk/fs/xfs/xfs_inode_item.c index fd4f398bd6f1..7c8d30c453c3 100644 --- a/trunk/fs/xfs/xfs_inode_item.c +++ b/trunk/fs/xfs/xfs_inode_item.c @@ -842,64 +842,15 @@ xfs_inode_item_destroy( * flushed to disk. It is responsible for removing the inode item * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. - * - * To reduce AIL lock traffic as much as possible, we scan the buffer log item - * list for other inodes that will run this function. We remove them from the - * buffer list so we can process all the inode IO completions in one AIL lock - * traversal. */ void xfs_iflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_inode_log_item *iip; - struct xfs_log_item *blip; - struct xfs_log_item *next; - struct xfs_log_item *prev; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + xfs_inode_t *ip = iip->ili_inode; struct xfs_ail *ailp = lip->li_ailp; - int need_ail = 0; - - /* - * Scan the buffer IO completions for other inodes being completed and - * attach them to the current inode log item. - */ - blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - prev = NULL; - while (blip != NULL) { - if (lip->li_cb != xfs_iflush_done) { - prev = blip; - blip = blip->li_bio_list; - continue; - } - - /* remove from list */ - next = blip->li_bio_list; - if (!prev) { - XFS_BUF_SET_FSPRIVATE(bp, next); - } else { - prev->li_bio_list = next; - } - - /* add to current list */ - blip->li_bio_list = lip->li_bio_list; - lip->li_bio_list = blip; - - /* - * while we have the item, do the unlocked check for needing - * the AIL lock. - */ - iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) - need_ail++; - - blip = next; - } - - /* make sure we capture the state of the initial inode. */ - iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) - need_ail++; /* * We only want to pull the item from the AIL if it is @@ -910,37 +861,28 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { spin_lock(&ailp->xa_lock); - for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; - } - ASSERT(i <= need_ail); + if (lip->li_lsn == iip->ili_flush_lsn) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, lip); + } else { + spin_unlock(&ailp->xa_lock); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i); } + iip->ili_logged = 0; /* - * clean up and unlock the flush lock now we are done. We can clear the - * ili_last_fields bits now that we know that the data corresponding to - * them is safely on disk. + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. 
*/ - for (blip = lip; blip; blip = next) { - next = blip->li_bio_list; - blip->li_bio_list = NULL; + iip->ili_last_fields = 0; - iip = INODE_ITEM(blip); - iip->ili_logged = 0; - iip->ili_last_fields = 0; - xfs_ifunlock(iip->ili_inode); - } + /* + * Release the inode's flush lock since we're done with it. + */ + xfs_ifunlock(ip); } /* diff --git a/trunk/fs/xfs/xfs_iomap.c b/trunk/fs/xfs/xfs_iomap.c index 55582bd66659..20576146369f 100644 --- a/trunk/fs/xfs/xfs_iomap.c +++ b/trunk/fs/xfs/xfs_iomap.c @@ -47,8 +47,127 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) +#define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP +STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + int, struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, + struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int *); + +int +xfs_iomap( + struct xfs_inode *ip, + xfs_off_t offset, + ssize_t count, + int flags, + struct xfs_bmbt_irec *imap, + int *nimaps, + int *new) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + int bmapi_flags = 0; + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + + *new = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + trace_xfs_iomap_enter(ip, offset, count, flags, NULL); + + switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { + case BMAPI_READ: + lockmode = xfs_ilock_map_shared(ip); + bmapi_flags = XFS_BMAPI_ENTIRE; + break; + case BMAPI_WRITE: + lockmode = XFS_ILOCK_EXCL; + if (flags & BMAPI_IGNSTATE) + bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; + xfs_ilock(ip, lockmode); + break; + case BMAPI_ALLOCATE: + lockmode = XFS_ILOCK_SHARED; + bmapi_flags = XFS_BMAPI_ENTIRE; + + /* Attempt non-blocking lock */ + if (flags & BMAPI_TRYLOCK) { + if (!xfs_ilock_nowait(ip, lockmode)) + return XFS_ERROR(EAGAIN); + } else { + xfs_ilock(ip, lockmode); + } + break; + default: + BUG(); + } + + ASSERT(offset <= mp->m_maxioffset); + if ((xfs_fsize_t)offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_bmapi(NULL, ip, offset_fsb, + (xfs_filblks_t)(end_fsb - offset_fsb), + bmapi_flags, NULL, 0, imap, + nimaps, NULL); + + if (error) + goto out; + + switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { + case BMAPI_WRITE: + /* If we found an extent, return it */ + if (*nimaps && + (imap->br_startblock != HOLESTARTBLOCK) && + (imap->br_startblock != DELAYSTARTBLOCK)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); + break; + } + + if (flags & BMAPI_DIRECT) { + error = xfs_iomap_write_direct(ip, offset, count, flags, + imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, count, flags, + imap, nimaps); + } + if (!error) { + trace_xfs_iomap_alloc(ip, offset, count, flags, imap); + } + *new = 1; + break; + case BMAPI_ALLOCATE: + /* If we found an extent, return it */ + xfs_iunlock(ip, lockmode); + lockmode = 0; + + if (*nimaps && !isnullstartblock(imap->br_startblock)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); + break; + } + + error = xfs_iomap_write_allocate(ip, offset, count, + imap, nimaps); + break; + } + + ASSERT(*nimaps <= 1); + +out: + if (lockmode) + xfs_iunlock(ip, lockmode); + return XFS_ERROR(error); +} + STATIC int 
xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -117,13 +236,14 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -int +STATIC int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, + int flags, xfs_bmbt_irec_t *imap, - int nmaps) + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -159,7 +279,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) imap->br_blockcount + imap->br_startoff); @@ -211,7 +331,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if (offset < ip->i_size || extsz) + if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -250,6 +370,7 @@ xfs_iomap_write_direct( goto error_out; } + *nmaps = 1; return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -258,6 +379,7 @@ xfs_iomap_write_direct( error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + *nmaps = 0; /* nothing set-up here */ error_out: return XFS_ERROR(error); @@ -267,9 +389,6 @@ xfs_iomap_write_direct( * If the caller is doing a write at the end of the file, then extend the * allocation out to the file system's write iosize. We clean up any extra * space left over when the file is closed in xfs_inactive(). - * - * If we find we already have delalloc preallocation beyond EOF, don't do more - * preallocation as it it not needed. */ STATIC int xfs_iomap_eof_want_preallocate( @@ -277,6 +396,7 @@ xfs_iomap_eof_want_preallocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, + int ioflag, xfs_bmbt_irec_t *imap, int nimaps, int *prealloc) @@ -285,7 +405,6 @@ xfs_iomap_eof_want_preallocate( xfs_filblks_t count_fsb; xfs_fsblock_t firstblock; int n, error, imaps; - int found_delalloc = 0; *prealloc = 0; if ((offset + count) <= ip->i_size) @@ -310,66 +429,20 @@ xfs_iomap_eof_want_preallocate( return 0; start_fsb += imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount; - - if (imap[n].br_startblock == DELAYSTARTBLOCK) - found_delalloc = 1; } } - if (!found_delalloc) - *prealloc = 1; + *prealloc = 1; return 0; } -/* - * If we don't have a user specified preallocation size, dynamically increase - * the preallocation size as the size of the file grows. Cap the maximum size - * at a single extent or less if the filesystem is near full. The closer the - * filesystem is to full, the smaller the maximum prealocation. 
- */ -STATIC xfs_fsblock_t -xfs_iomap_prealloc_size( - struct xfs_mount *mp, - struct xfs_inode *ip) -{ - xfs_fsblock_t alloc_blocks = 0; - - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { - int shift = 0; - int64_t freesp; - - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); - alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, - rounddown_pow_of_two(alloc_blocks)); - - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } - if (shift) - alloc_blocks >>= shift; - } - - if (alloc_blocks < mp->m_writeio_blocks) - alloc_blocks = mp->m_writeio_blocks; - - return alloc_blocks; -} - -int +STATIC int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *ret_imap) + int ioflag, + xfs_bmbt_irec_t *ret_imap, + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -396,19 +469,16 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - imap, XFS_WRITE_IMAPS, &prealloc); + ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; retry: if (prealloc) { - xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); - aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + alloc_blocks; + last_fsb = ioalign + mp->m_writeio_blocks; } else { last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } @@ -426,31 +496,22 @@ xfs_iomap_write_delay( XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - switch (error) { - case 0: - case ENOSPC: - case EDQUOT: - break; - default: + if (error && (error != ENOSPC)) return XFS_ERROR(error); - } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry - * without EOF preallocation. + * If bmapi returned us nothing, and if we didn't get back EDQUOT, + * then we must have run out of space - flush all other inodes with + * delalloc blocks and retry without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); if (flushed) - return XFS_ERROR(error ? error : ENOSPC); + return XFS_ERROR(ENOSPC); - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); flushed = 1; error = 0; @@ -462,6 +523,8 @@ xfs_iomap_write_delay( return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; + *nmaps = 1; + return 0; } @@ -475,12 +538,13 @@ xfs_iomap_write_delay( * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. 
*/ -int +STATIC int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *imap) + xfs_bmbt_irec_t *imap, + int *retmap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; @@ -493,6 +557,8 @@ xfs_iomap_write_allocate( int error = 0; int nres; + *retmap = 0; + /* * Make sure that the dquots are there. */ @@ -614,6 +680,7 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { + *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; } diff --git a/trunk/fs/xfs/xfs_iomap.h b/trunk/fs/xfs/xfs_iomap.h index 80615760959a..7748a430f50d 100644 --- a/trunk/fs/xfs/xfs_iomap.h +++ b/trunk/fs/xfs/xfs_iomap.h @@ -18,15 +18,30 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +/* base extent manipulation calls */ +#define BMAPI_READ (1 << 0) /* read extents */ +#define BMAPI_WRITE (1 << 1) /* create extents */ +#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ + +/* modifiers */ +#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ +#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ +#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ +#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ + +#define BMAPI_FLAGS \ + { BMAPI_READ, "READ" }, \ + { BMAPI_WRITE, "WRITE" }, \ + { BMAPI_ALLOCATE, "ALLOCATE" }, \ + { BMAPI_IGNSTATE, "IGNSTATE" }, \ + { BMAPI_DIRECT, "DIRECT" }, \ + { BMAPI_TRYLOCK, "TRYLOCK" } + struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *); +extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, + struct xfs_bmbt_irec *, int *, int *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/trunk/fs/xfs/xfs_log.c b/trunk/fs/xfs/xfs_log.c index 0bf24b11d0c4..cee4ab9f8a9e 100644 --- a/trunk/fs/xfs/xfs_log.c +++ b/trunk/fs/xfs/xfs_log.c @@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ STATIC int xlog_grant_log_space(xlog_t *log, xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(struct log *log, +STATIC void xlog_grant_push_ail(xfs_mount_t *mp, int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); @@ -81,73 +81,98 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void xlog_verify_grant_head(xlog_t *log, int equals); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) -#define xlog_verify_grant_tail(a) +#define xlog_verify_grant_head(a,b) 
#define xlog_verify_iclog(a,b,c,d) #define xlog_verify_tail_lsn(a,b,c) #endif STATIC int xlog_iclogs_empty(xlog_t *log); + static void -xlog_grant_sub_space( - struct log *log, - atomic64_t *head, - int bytes) +xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) { - int64_t head_val = atomic64_read(head); - int64_t new, old; - - do { - int cycle, space; + if (*qp) { + tic->t_next = (*qp); + tic->t_prev = (*qp)->t_prev; + (*qp)->t_prev->t_next = tic; + (*qp)->t_prev = tic; + } else { + tic->t_prev = tic->t_next = tic; + *qp = tic; + } - xlog_crack_grant_head_val(head_val, &cycle, &space); + tic->t_flags |= XLOG_TIC_IN_Q; +} - space -= bytes; - if (space < 0) { - space += log->l_logsize; - cycle--; - } +static void +xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +{ + if (tic == tic->t_next) { + *qp = NULL; + } else { + *qp = tic->t_next; + tic->t_next->t_prev = tic->t_prev; + tic->t_prev->t_next = tic->t_next; + } - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); - } while (head_val != old); + tic->t_next = tic->t_prev = NULL; + tic->t_flags &= ~XLOG_TIC_IN_Q; } static void -xlog_grant_add_space( - struct log *log, - atomic64_t *head, - int bytes) +xlog_grant_sub_space(struct log *log, int bytes) { - int64_t head_val = atomic64_read(head); - int64_t new, old; + log->l_grant_write_bytes -= bytes; + if (log->l_grant_write_bytes < 0) { + log->l_grant_write_bytes += log->l_logsize; + log->l_grant_write_cycle--; + } - do { - int tmp; - int cycle, space; + log->l_grant_reserve_bytes -= bytes; + if ((log)->l_grant_reserve_bytes < 0) { + log->l_grant_reserve_bytes += log->l_logsize; + log->l_grant_reserve_cycle--; + } - xlog_crack_grant_head_val(head_val, &cycle, &space); +} - tmp = log->l_logsize - space; - if (tmp > bytes) - space += bytes; - else { - space = bytes - tmp; - cycle++; - } +static void +xlog_grant_add_space_write(struct log *log, int bytes) +{ + int tmp = log->l_logsize - log->l_grant_write_bytes; + if (tmp > bytes) + log->l_grant_write_bytes += bytes; + else { + log->l_grant_write_cycle++; + log->l_grant_write_bytes = bytes - tmp; + } +} + +static void +xlog_grant_add_space_reserve(struct log *log, int bytes) +{ + int tmp = log->l_logsize - log->l_grant_reserve_bytes; + if (tmp > bytes) + log->l_grant_reserve_bytes += bytes; + else { + log->l_grant_reserve_cycle++; + log->l_grant_reserve_bytes = bytes - tmp; + } +} - old = head_val; - new = xlog_assign_grant_head_val(cycle, space); - head_val = atomic64_cmpxchg(head, old, new); - } while (head_val != old); +static inline void +xlog_grant_add_space(struct log *log, int bytes) +{ + xlog_grant_add_space_write(log, bytes); + xlog_grant_add_space_reserve(log, bytes); } static void @@ -330,7 +355,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(log, internal_ticket->t_unit_res); + xlog_grant_push_ail(mp, internal_ticket->t_unit_res); retval = xlog_regrant_write_log_space(log, internal_ticket); } else { /* may sleep if need to allocate more tickets */ @@ -344,7 +369,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(log, + xlog_grant_push_ail(mp, (internal_ticket->t_unit_res * internal_ticket->t_cnt)); retval = xlog_grant_log_space(log, internal_ticket); @@ -559,8 +584,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, 
- &log->l_icloglock); + sv_wait(&iclog->ic_force_wait, PMEM, + &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); } @@ -600,8 +625,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - xlog_wait(&iclog->ic_force_wait, - &log->l_icloglock); + sv_wait(&iclog->ic_force_wait, PMEM, + &log->l_icloglock, s); } else { spin_unlock(&log->l_icloglock); } @@ -678,46 +703,55 @@ xfs_log_move_tail(xfs_mount_t *mp, { xlog_ticket_t *tic; xlog_t *log = mp->m_log; - int need_bytes, free_bytes; + int need_bytes, free_bytes, cycle, bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) - tail_lsn = atomic64_read(&log->l_last_sync_lsn); + if (tail_lsn == 0) { + /* needed since sync_lsn is 64 bits */ + spin_lock(&log->l_icloglock); + tail_lsn = log->l_last_sync_lsn; + spin_unlock(&log->l_icloglock); + } + + spin_lock(&log->l_grant_lock); - /* tail_lsn == 1 implies that we weren't passed a valid value. */ - if (tail_lsn != 1) - atomic64_set(&log->l_tail_lsn, tail_lsn); + /* Also an invalid lsn. 1 implies that we aren't passing in a valid + * tail_lsn. + */ + if (tail_lsn != 1) { + log->l_tail_lsn = tail_lsn; + } - if (!list_empty_careful(&log->l_writeq)) { + if ((tic = log->l_write_headq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(tic, &log->l_writeq, t_queue) { + cycle = log->l_grant_write_cycle; + bytes = log->l_grant_write_bytes; + free_bytes = xlog_space_left(log, cycle, bytes); + do { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < tic->t_unit_res && tail_lsn != 1) break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - trace_xfs_log_regrant_write_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_write_lock); + sv_signal(&tic->t_wait); + tic = tic->t_next; + } while (tic != log->l_write_headq); } - - if (!list_empty_careful(&log->l_reserveq)) { + if ((tic = log->l_reserve_headq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - spin_lock(&log->l_grant_reserve_lock); - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - list_for_each_entry(tic, &log->l_reserveq, t_queue) { + cycle = log->l_grant_reserve_cycle; + bytes = log->l_grant_reserve_bytes; + free_bytes = xlog_space_left(log, cycle, bytes); + do { if (tic->t_flags & XLOG_TIC_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_cnt; else @@ -726,12 +760,12 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - trace_xfs_log_grant_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_reserve_lock); + sv_signal(&tic->t_wait); + tic = tic->t_next; + } while (tic != log->l_reserve_headq); } -} + spin_unlock(&log->l_grant_lock); +} /* xfs_log_move_tail */ /* * Determine if we have a transaction that has gone to disk @@ -797,19 +831,23 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. 
*/ xfs_lsn_t -xlog_assign_tail_lsn( - struct xfs_mount *mp) +xlog_assign_tail_lsn(xfs_mount_t *mp) { - xfs_lsn_t tail_lsn; - struct log *log = mp->m_log; + xfs_lsn_t tail_lsn; + xlog_t *log = mp->m_log; tail_lsn = xfs_trans_ail_tail(mp->m_ail); - if (!tail_lsn) - tail_lsn = atomic64_read(&log->l_last_sync_lsn); + spin_lock(&log->l_grant_lock); + if (tail_lsn != 0) { + log->l_tail_lsn = tail_lsn; + } else { + tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; + } + spin_unlock(&log->l_grant_lock); - atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; -} +} /* xlog_assign_tail_lsn */ + /* * Return the space in the log between the tail and the head. The head @@ -826,26 +864,21 @@ xlog_assign_tail_lsn( * result is that we return the size of the log as the amount of space left. */ STATIC int -xlog_space_left( - struct log *log, - atomic64_t *head) -{ - int free_bytes; - int tail_bytes; - int tail_cycle; - int head_cycle; - int head_bytes; - - xlog_crack_grant_head(head, &head_cycle, &head_bytes); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); - tail_bytes = BBTOB(tail_bytes); - if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - free_bytes = log->l_logsize - (head_bytes - tail_bytes); - else if (tail_cycle + 1 < head_cycle) +xlog_space_left(xlog_t *log, int cycle, int bytes) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + + tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); + tail_cycle = CYCLE_LSN(log->l_tail_lsn); + if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { + free_bytes = log->l_logsize - (bytes - tail_bytes); + } else if ((tail_cycle + 1) < cycle) { return 0; - else if (tail_cycle < head_cycle) { - ASSERT(tail_cycle == (head_cycle - 1)); - free_bytes = tail_bytes - head_bytes; + } else if (tail_cycle < cycle) { + ASSERT(tail_cycle == (cycle - 1)); + free_bytes = tail_bytes - bytes; } else { /* * The reservation head is behind the tail. 
@@ -856,12 +889,12 @@ xlog_space_left( "xlog_space_left: head behind tail\n" " tail_cycle = %d, tail_bytes = %d\n" " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, head_cycle, head_bytes); + tail_cycle, tail_bytes, cycle, bytes); ASSERT(0); free_bytes = log->l_logsize; } return free_bytes; -} +} /* xlog_space_left */ /* @@ -1014,16 +1047,12 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_flags |= XLOG_ACTIVE_RECOVERY; log->l_prev_block = -1; + log->l_tail_lsn = xlog_assign_lsn(1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ - xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); + log->l_last_sync_lsn = log->l_tail_lsn; log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); - xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); - INIT_LIST_HEAD(&log->l_reserveq); - INIT_LIST_HEAD(&log->l_writeq); - spin_lock_init(&log->l_grant_reserve_lock); - spin_lock_init(&log->l_grant_write_lock); + log->l_grant_reserve_cycle = 1; + log->l_grant_write_cycle = 1; error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1065,7 +1094,8 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); - init_waitqueue_head(&log->l_flush_wait); + spin_lock_init(&log->l_grant_lock); + sv_init(&log->l_flush_wait, 0, "flush_wait"); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1121,8 +1151,8 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - init_waitqueue_head(&iclog->ic_force_wait); - init_waitqueue_head(&iclog->ic_write_wait); + sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); + sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); iclogp = &iclog->ic_next; } @@ -1137,11 +1167,15 @@ xlog_alloc_log(xfs_mount_t *mp, out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) + if (iclog->ic_bp) { + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); + } kmem_free(iclog); } spinlock_destroy(&log->l_icloglock); + spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); @@ -1189,60 +1223,61 @@ xlog_commit_record( * water mark. In this manner, we would be creating a low water mark. */ STATIC void -xlog_grant_push_ail( - struct log *log, - int need_bytes) +xlog_grant_push_ail(xfs_mount_t *mp, + int need_bytes) { - xfs_lsn_t threshold_lsn = 0; - xfs_lsn_t last_sync_lsn; - int free_blocks; - int free_bytes; - int threshold_block; - int threshold_cycle; - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. 
- */ - free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); - if (free_blocks >= free_threshold) - return; - - xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, - &threshold_block); - threshold_block += free_threshold; + xlog_t *log = mp->m_log; /* pointer to the log */ + xfs_lsn_t tail_lsn; /* lsn of the log tail */ + xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ + int free_blocks; /* free blocks left to write to */ + int free_bytes; /* free bytes left to write to */ + int threshold_block; /* block in lsn we'd like to be at */ + int threshold_cycle; /* lsn cycle we'd like to be at */ + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + spin_lock(&log->l_grant_lock); + free_bytes = xlog_space_left(log, + log->l_grant_reserve_cycle, + log->l_grant_reserve_bytes); + tail_lsn = log->l_tail_lsn; + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks < free_threshold) { + threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; + threshold_cycle = CYCLE_LSN(tail_lsn); if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; } - threshold_lsn = xlog_assign_lsn(threshold_cycle, - threshold_block); - /* - * Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. Use a snapshot of the last sync lsn - * so that it doesn't change between the compare and the set. - */ - last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); - if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) - threshold_lsn = last_sync_lsn; + threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. + /* Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_ail_push(log->l_ailp, threshold_lsn); -} + if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) + threshold_lsn = log->l_last_sync_lsn; + } + spin_unlock(&log->l_grant_lock); + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. + */ + if (threshold_lsn && + !XLOG_FORCED_SHUTDOWN(log)) + xfs_trans_ail_push(log->l_ailp, threshold_lsn); +} /* xlog_grant_push_ail */ /* * The bdstrat callback function for log bufs. 
This gives us a central @@ -1337,8 +1372,9 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); - xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); + spin_lock(&log->l_grant_lock); + xlog_grant_add_space(log, roundoff); + spin_unlock(&log->l_grant_lock); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -1453,12 +1489,15 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; il_iclog_bufs; i++) { + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; } spinlock_destroy(&log->l_icloglock); + spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; @@ -2193,7 +2232,7 @@ xlog_state_do_callback( lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2201,21 +2240,23 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; + spin_unlock(&log->l_icloglock); - /* - * update the last_sync_lsn before we drop the - * icloglock to ensure we are the only one that - * can update it. + /* l_last_sync_lsn field protected by + * l_grant_lock. Don't worry about iclog's lsn. + * No one else can be here except us. */ - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + spin_lock(&log->l_grant_lock); + ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + log->l_last_sync_lsn = + be64_to_cpu(iclog->ic_header.h_lsn); + spin_unlock(&log->l_grant_lock); - } else + } else { + spin_unlock(&log->l_icloglock); ioerrors++; - - spin_unlock(&log->l_icloglock); + } /* * Keep processing entries in the callback list until @@ -2256,7 +2297,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - wake_up_all(&iclog->ic_force_wait); + sv_broadcast(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2303,7 +2344,7 @@ xlog_state_do_callback( spin_unlock(&log->l_icloglock); if (wake) - wake_up_all(&log->l_flush_wait); + sv_broadcast(&log->l_flush_wait); } @@ -2354,7 +2395,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - wake_up_all(&iclog->ic_write_wait); + sv_broadcast(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2403,7 +2444,7 @@ xlog_state_get_iclog_space(xlog_t *log, XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ - xlog_wait(&log->l_flush_wait, &log->l_icloglock); + sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); goto restart; } @@ -2486,18 +2527,6 @@ xlog_state_get_iclog_space(xlog_t *log, * * Once a ticket gets put onto the reserveq, it will only return after * the needed reservation is satisfied. - * - * This function is structured so that it has a lock free fast path. This is - * necessary because every new transaction reservation will come through this - * path. 
Hence any lock will be globally hot if we take it unconditionally on - * every pass. - * - * As tickets are only ever moved on and off the reserveq under the - * l_grant_reserve_lock, we only need to take that lock if we are going - * to add the ticket to the queue and sleep. We can avoid taking the lock if the - * ticket was never added to the reserveq because the t_queue list head will be - * empty and we hold the only reference to it so it can safely be checked - * unlocked. */ STATIC int xlog_grant_log_space(xlog_t *log, @@ -2505,27 +2534,24 @@ xlog_grant_log_space(xlog_t *log, { int free_bytes; int need_bytes; +#ifdef DEBUG + xfs_lsn_t tail_lsn; +#endif + #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("grant Recovery problem"); #endif - trace_xfs_log_grant_enter(log, tic); + /* Is there space or do we need to sleep? */ + spin_lock(&log->l_grant_lock); - need_bytes = tic->t_unit_res; - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes *= tic->t_ocnt; + trace_xfs_log_grant_enter(log, tic); /* something is already sleeping; insert new transaction at end */ - if (!list_empty_careful(&log->l_reserveq)) { - spin_lock(&log->l_grant_reserve_lock); - /* recheck the queue now we are locked */ - if (list_empty(&log->l_reserveq)) { - spin_unlock(&log->l_grant_reserve_lock); - goto redo; - } - list_add_tail(&tic->t_queue, &log->l_reserveq); + if (log->l_reserve_headq) { + xlog_ins_ticketq(&log->l_reserve_headq, tic); trace_xfs_log_grant_sleep1(log, tic); @@ -2537,57 +2563,72 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); - + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... 
*/ trace_xfs_log_grant_wake1(log, tic); + spin_lock(&log->l_grant_lock); } + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes = tic->t_unit_res*tic->t_ocnt; + else + need_bytes = tic->t_unit_res; redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; + goto error_return; - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, + log->l_grant_reserve_bytes); if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_reserve_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_reserveq); + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + xlog_ins_ticketq(&log->l_reserve_headq, tic); trace_xfs_log_grant_sleep2(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; trace_xfs_log_grant_wake2(log, tic); - goto redo; - } - if (!list_empty(&tic->t_queue)) { - spin_lock(&log->l_grant_reserve_lock); - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_reserve_lock); - } + goto redo; + } else if (tic->t_flags & XLOG_TIC_IN_Q) + xlog_del_ticketq(&log->l_reserve_headq, tic); /* we've got enough space */ - xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); + xlog_grant_add_space(log, need_bytes); +#ifdef DEBUG + tail_lsn = log->l_tail_lsn; + /* + * Check to make sure the grant write head didn't just over lap the + * tail. If the cycles are the same, we can't be overlapping. + * Otherwise, make sure that the cycles differ by exactly one and + * check the byte count. + */ + if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { + ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); + ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + } +#endif trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_tail(log); + xlog_verify_grant_head(log, 1); + spin_unlock(&log->l_grant_lock); return 0; -error_return_unlocked: - spin_lock(&log->l_grant_reserve_lock); -error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_reserve_lock); + error_return: + if (tic->t_flags & XLOG_TIC_IN_Q) + xlog_del_ticketq(&log->l_reserve_headq, tic); + trace_xfs_log_grant_error(log, tic); /* @@ -2597,6 +2638,7 @@ xlog_grant_log_space(xlog_t *log, */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ @@ -2604,14 +2646,17 @@ xlog_grant_log_space(xlog_t *log, /* * Replenish the byte reservation required by moving the grant write head. * - * Similar to xlog_grant_log_space, the function is structured to have a lock - * free fast path. 
+ * */ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes, need_bytes; + xlog_ticket_t *ntic; +#ifdef DEBUG + xfs_lsn_t tail_lsn; +#endif tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2624,9 +2669,12 @@ xlog_regrant_write_log_space(xlog_t *log, panic("regrant Recovery problem"); #endif + spin_lock(&log->l_grant_lock); + trace_xfs_log_regrant_write_enter(log, tic); + if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; + goto error_return; /* If there are other waiters on the queue then give them a * chance at logspace before us. Wake up the first waiters, @@ -2635,76 +2683,92 @@ xlog_regrant_write_log_space(xlog_t *log, * this transaction. */ need_bytes = tic->t_unit_res; - if (!list_empty_careful(&log->l_writeq)) { - struct xlog_ticket *ntic; - - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(ntic, &log->l_writeq, t_queue) { + if ((ntic = log->l_write_headq)) { + free_bytes = xlog_space_left(log, log->l_grant_write_cycle, + log->l_grant_write_bytes); + do { ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - wake_up(&ntic->t_wait); - } + sv_signal(&ntic->t_wait); + ntic = ntic->t_next; + } while (ntic != log->l_write_headq); + + if (ntic != log->l_write_headq) { + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + xlog_ins_ticketq(&log->l_write_headq, tic); - if (ntic != list_first_entry(&log->l_writeq, - struct xlog_ticket, t_queue)) { - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); trace_xfs_log_regrant_write_sleep1(log, tic); - xlog_grant_push_ail(log, need_bytes); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, + &log->l_grant_lock, s); + + /* If we're shutting down, this tic is already + * off the queue */ + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + trace_xfs_log_regrant_write_wake1(log, tic); - } else - spin_unlock(&log->l_grant_write_lock); + } } redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; + goto error_return; - free_bytes = xlog_space_left(log, &log->l_grant_write_head); + free_bytes = xlog_space_left(log, log->l_grant_write_cycle, + log->l_grant_write_bytes); if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_write_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + xlog_ins_ticketq(&log->l_write_headq, tic); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); trace_xfs_log_regrant_write_sleep2(log, tic); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + + /* If we're shutting down, this tic is already off the queue */ + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; trace_xfs_log_regrant_write_wake2(log, tic); goto redo; - } + } else if (tic->t_flags & XLOG_TIC_IN_Q) + xlog_del_ticketq(&log->l_write_headq, tic); - if (!list_empty(&tic->t_queue)) { - spin_lock(&log->l_grant_write_lock); - 
list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_write_lock); + /* we've got enough space */ + xlog_grant_add_space_write(log, need_bytes); +#ifdef DEBUG + tail_lsn = log->l_tail_lsn; + if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { + ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); + ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); } +#endif - /* we've got enough space */ - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); - xlog_verify_grant_tail(log); + + xlog_verify_grant_head(log, 1); + spin_unlock(&log->l_grant_lock); return 0; - error_return_unlocked: - spin_lock(&log->l_grant_write_lock); error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_write_lock); + if (tic->t_flags & XLOG_TIC_IN_Q) + xlog_del_ticketq(&log->l_reserve_headq, tic); + trace_xfs_log_regrant_write_error(log, tic); /* @@ -2714,6 +2778,7 @@ xlog_regrant_write_log_space(xlog_t *log, */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ @@ -2734,24 +2799,27 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_grant_reserve_head, - ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_grant_write_head, - ticket->t_curr_res); + spin_lock(&log->l_grant_lock); + xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); trace_xfs_log_regrant_reserve_sub(log, ticket); + xlog_verify_grant_head(log, 1); + /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) + if (ticket->t_cnt > 0) { + spin_unlock(&log->l_grant_lock); return; + } - xlog_grant_add_space(log, &log->l_grant_reserve_head, - ticket->t_unit_res); + xlog_grant_add_space_reserve(log, ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); + xlog_verify_grant_head(log, 0); + spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2775,29 +2843,28 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { - int bytes; - if (ticket->t_cnt > 0) ticket->t_cnt--; + spin_lock(&log->l_grant_lock); trace_xfs_log_ungrant_enter(log, ticket); + + xlog_grant_sub_space(log, ticket->t_curr_res); + trace_xfs_log_ungrant_sub(log, ticket); - /* - * If this is a permanent reservation ticket, we may be able to free + /* If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. 
*/ - bytes = ticket->t_curr_res; if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - bytes += ticket->t_unit_res*ticket->t_cnt; + xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); } - xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); - xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + xlog_verify_grant_head(log, 1); + spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ @@ -2834,11 +2901,11 @@ xlog_state_release_iclog( if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); + xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); /* cycle incremented when incrementing curr_block */ } spin_unlock(&log->l_icloglock); @@ -3021,7 +3088,7 @@ _xfs_log_force( return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3139,8 +3206,8 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, - &log->l_icloglock); + sv_wait(&iclog->ic_prev->ic_write_wait, + PSWP, &log->l_icloglock, s); if (log_flushed) *log_flushed = 1; already_slept = 1; @@ -3168,7 +3235,7 @@ _xfs_log_force_lsn( return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3243,8 +3310,10 @@ xfs_log_ticket_put( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); - if (atomic_dec_and_test(&ticket->t_ref)) + if (atomic_dec_and_test(&ticket->t_ref)) { + sv_destroy(&ticket->t_wait); kmem_zone_free(xfs_log_ticket_zone, ticket); + } } xlog_ticket_t * @@ -3366,7 +3435,6 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); - INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; @@ -3377,7 +3445,7 @@ xlog_ticket_alloc( tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - init_waitqueue_head(&tic->t_wait); + sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3416,25 +3484,18 @@ xlog_verify_dest_ptr( } STATIC void -xlog_verify_grant_tail( - struct log *log) +xlog_verify_grant_head(xlog_t *log, int equals) { - int tail_cycle, tail_blocks; - int cycle, space; - - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. 
- */ - xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); - xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); - if (tail_cycle != cycle) { - ASSERT(cycle - 1 == tail_cycle); - ASSERT(space <= BBTOB(tail_blocks)); - } -} + if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { + if (equals) + ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); + else + ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); + } else { + ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); + ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); + } +} /* xlog_verify_grant_head */ /* check if it will fit */ STATIC void @@ -3655,10 +3716,12 @@ xfs_log_force_umount( xlog_cil_force(log); /* - * mark the filesystem and the as in a shutdown state and wake - * everybody up to tell them the bad news. + * We must hold both the GRANT lock and the LOG lock, + * before we mark the filesystem SHUTDOWN and wake + * everybody up to tell the bad news. */ spin_lock(&log->l_icloglock); + spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); @@ -3679,21 +3742,27 @@ xfs_log_force_umount( spin_unlock(&log->l_icloglock); /* - * We don't want anybody waiting for log reservations after this. That - * means we have to wake up everybody queued up on reserveq as well as - * writeq. In addition, we make sure in xlog_{re}grant_log_space that - * we don't enqueue anything once the SHUTDOWN flag is set, and this - * action is protected by the grant locks. + * We don't want anybody waiting for log reservations + * after this. That means we have to wake up everybody + * queued up on reserve_headq as well as write_headq. + * In addition, we make sure in xlog_{re}grant_log_space + * that we don't enqueue anything once the SHUTDOWN flag + * is set, and this action is protected by the GRANTLOCK. */ - spin_lock(&log->l_grant_reserve_lock); - list_for_each_entry(tic, &log->l_reserveq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_reserve_lock); - - spin_lock(&log->l_grant_write_lock); - list_for_each_entry(tic, &log->l_writeq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_write_lock); + if ((tic = log->l_reserve_headq)) { + do { + sv_signal(&tic->t_wait); + tic = tic->t_next; + } while (tic != log->l_reserve_headq); + } + + if ((tic = log->l_write_headq)) { + do { + sv_signal(&tic->t_wait); + tic = tic->t_next; + } while (tic != log->l_write_headq); + } + spin_unlock(&log->l_grant_lock); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/trunk/fs/xfs/xfs_log_cil.c b/trunk/fs/xfs/xfs_log_cil.c index 9dc8125d04e5..23d6ceb5e97b 100644 --- a/trunk/fs/xfs/xfs_log_cil.c +++ b/trunk/fs/xfs/xfs_log_cil.c @@ -61,7 +61,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); init_rwsem(&cil->xc_ctx_lock); - init_waitqueue_head(&cil->xc_commit_wait); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); @@ -361,10 +361,15 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? 
XFS_LI_ABORTED : 0; struct xfs_busy_extent *busyp, *n; - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, - ctx->start_lsn, abort); + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); @@ -563,7 +568,7 @@ xlog_cil_push( * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); goto restart; } } @@ -587,7 +592,7 @@ xlog_cil_push( */ spin_lock(&cil->xc_cil_lock); ctx->commit_lsn = commit_lsn; - wake_up_all(&cil->xc_commit_wait); + sv_broadcast(&cil->xc_commit_wait); spin_unlock(&cil->xc_cil_lock); /* release the hounds! */ @@ -752,7 +757,7 @@ xlog_cil_force_lsn( * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); goto restart; } if (ctx->sequence != sequence) diff --git a/trunk/fs/xfs/xfs_log_priv.h b/trunk/fs/xfs/xfs_log_priv.h index d5f8be8f4bf6..edcdfe01617f 100644 --- a/trunk/fs/xfs/xfs_log_priv.h +++ b/trunk/fs/xfs/xfs_log_priv.h @@ -21,6 +21,7 @@ struct xfs_buf; struct log; struct xlog_ticket; +struct xfs_buf_cancel; struct xfs_mount; /* @@ -53,6 +54,7 @@ struct xfs_mount; BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) { return ((xfs_lsn_t)cycle << 32) | block; @@ -131,10 +133,12 @@ static inline uint xlog_get_client_id(__be32 i) */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_IN_Q 0x4 #define XLOG_TIC_FLAGS \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ - { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ + { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } #endif /* __KERNEL__ */ @@ -240,8 +244,9 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - wait_queue_head_t t_wait; /* ticket wait queue */ - struct list_head t_queue; /* reserve/write queue */ + sv_t t_wait; /* ticket wait queue : 20 */ + struct xlog_ticket *t_next; /* :4|8 */ + struct xlog_ticket *t_prev; /* :4|8 */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -348,8 +353,8 @@ typedef union xlog_in_core2 { * and move everything else out to subsequent cachelines. 
*/ typedef struct xlog_in_core { - wait_queue_head_t ic_force_wait; - wait_queue_head_t ic_write_wait; + sv_t ic_force_wait; + sv_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; @@ -416,7 +421,7 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; - wait_queue_head_t xc_commit_wait; + sv_t xc_commit_wait; xfs_lsn_t xc_current_sequence; }; @@ -486,7 +491,7 @@ typedef struct log { struct xfs_buftarg *l_targ; /* buftarg of log */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct list_head *l_buf_cancel_table; + struct xfs_buf_cancel **l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ @@ -498,40 +503,29 @@ typedef struct log { int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ - wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + sv_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ + xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed + * buffers */ + xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - /* - * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and - * read without needing to hold specific locks. To avoid operations - * contending with other hot objects, place each of them on a separate - * cacheline. - */ - /* lsn of last LR on disk */ - atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; - /* lsn of 1st LR with unflushed * buffers */ - atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; - - /* - * ticket grant locks, queues and accounting have their own cachlines - * as these are quite hot and can be operated on concurrently. - */ - spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; - struct list_head l_reserveq; - atomic64_t l_grant_reserve_head; - - spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; - struct list_head l_writeq; - atomic64_t l_grant_write_head; + /* The following block of fields are changed while holding grant_lock */ + spinlock_t l_grant_lock ____cacheline_aligned_in_smp; + xlog_ticket_t *l_reserve_headq; + xlog_ticket_t *l_write_headq; + int l_grant_reserve_cycle; + int l_grant_reserve_bytes; + int l_grant_write_cycle; + int l_grant_write_bytes; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -540,9 +534,6 @@ typedef struct log { } xlog_t; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ @@ -570,61 +561,6 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, xlog_in_core_t **commit_iclog, uint flags); -/* - * When we crack an atomic LSN, we sample it first so that the value will not - * change while we are cracking it into the component values. This means we - * will always get consistent component values to work from. 
This should always - * be used to smaple and crack LSNs taht are stored and updated in atomic - * variables. - */ -static inline void -xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) -{ - xfs_lsn_t val = atomic64_read(lsn); - - *cycle = CYCLE_LSN(val); - *block = BLOCK_LSN(val); -} - -/* - * Calculate and assign a value to an atomic LSN variable from component pieces. - */ -static inline void -xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) -{ - atomic64_set(lsn, xlog_assign_lsn(cycle, block)); -} - -/* - * When we crack the grant head, we sample it first so that the value will not - * change while we are cracking it into the component values. This means we - * will always get consistent component values to work from. - */ -static inline void -xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) -{ - *cycle = val >> 32; - *space = val & 0xffffffff; -} - -static inline void -xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) -{ - xlog_crack_grant_head_val(atomic64_read(head), cycle, space); -} - -static inline int64_t -xlog_assign_grant_head_val(int cycle, int space) -{ - return ((int64_t)cycle << 32) | space; -} - -static inline void -xlog_assign_grant_head(atomic64_t *head, int cycle, int space) -{ - atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); -} - /* * Committed Item List interfaces */ @@ -649,21 +585,6 @@ xlog_cil_force(struct log *log) */ #define XLOG_UNMOUNT_REC_TYPE (-1U) -/* - * Wrapper function for waiting on a wait queue serialised against wakeups - * by a spinlock. This matches the semantics of all the wait queues used in the - * log code. - */ -static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(wq, &wait); - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(lock); - schedule(); - remove_wait_queue(wq, &wait); -} #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/trunk/fs/xfs/xfs_log_recover.c b/trunk/fs/xfs/xfs_log_recover.c index 204d8e5fa7fa..966d3f97458c 100644 --- a/trunk/fs/xfs/xfs_log_recover.c +++ b/trunk/fs/xfs/xfs_log_recover.c @@ -52,17 +52,6 @@ STATIC void xlog_recover_check_summary(xlog_t *); #define xlog_recover_check_summary(log) #endif -/* - * This structure is used during recovery to record the buf log items which - * have been canceled and should not be replayed. - */ -struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct list_head bc_list; -}; - /* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -936,12 +925,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); + log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); + log->l_grant_reserve_cycle = log->l_curr_cycle; + log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); + log->l_grant_write_cycle = log->l_curr_cycle; + log->l_grant_write_bytes = BBTOB(log->l_curr_block); /* * Look for unmount record. 
If we find it, then we know there @@ -971,7 +960,7 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = atomic64_read(&log->l_tail_lsn); + tail_lsn = log->l_tail_lsn; if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; @@ -986,10 +975,12 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - xlog_assign_atomic_lsn(&log->l_tail_lsn, - log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); + log->l_tail_lsn = + xlog_assign_lsn(log->l_curr_cycle, + after_umount_blk); + log->l_last_sync_lsn = + xlog_assign_lsn(log->l_curr_cycle, + after_umount_blk); *tail_blk = after_umount_blk; /* @@ -1614,45 +1605,82 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC int -xlog_recover_buffer_pass1( - struct log *log, - xlog_recover_item_t *item) +STATIC void +xlog_recover_do_buffer_pass1( + xlog_t *log, + xfs_buf_log_format_t *buf_f) { - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - struct list_head *bucket; - struct xfs_buf_cancel *bcp; + xfs_buf_cancel_t *bcp; + xfs_buf_cancel_t *nextp; + xfs_buf_cancel_t *prevp; + xfs_buf_cancel_t **bucket; + xfs_daddr_t blkno = 0; + uint len = 0; + ushort flags = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + len = buf_f->blf_len; + flags = buf_f->blf_flags; + break; + } /* * If this isn't a cancel buffer item, then just return. */ - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (!(flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return 0; + return; } /* - * Insert an xfs_buf_cancel record into the hash table of them. - * If there is already an identical record, bump its reference count. + * Insert an xfs_buf_cancel record into the hash table of + * them. If there is already an identical record, bump + * its reference count. */ - bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == buf_f->blf_blkno && - bcp->bc_len == buf_f->blf_len) { - bcp->bc_refcount++; - trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return 0; - } + bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % + XLOG_BC_TABLE_SIZE]; + /* + * If the hash bucket is empty then just insert a new record into + * the bucket. + */ + if (*bucket == NULL) { + bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), + KM_SLEEP); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + bcp->bc_next = NULL; + *bucket = bcp; + return; } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); - bcp->bc_blkno = buf_f->blf_blkno; - bcp->bc_len = buf_f->blf_len; + /* + * The hash bucket is not empty, so search for duplicates of our + * record. If we find one them just bump its refcount. If not + * then add us at the end of the list. 
+ */ + prevp = NULL; + nextp = *bucket; + while (nextp != NULL) { + if (nextp->bc_blkno == blkno && nextp->bc_len == len) { + nextp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return; + } + prevp = nextp; + nextp = nextp->bc_next; + } + ASSERT(prevp != NULL); + bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), + KM_SLEEP); + bcp->bc_blkno = blkno; + bcp->bc_len = len; bcp->bc_refcount = 1; - list_add_tail(&bcp->bc_list, bucket); - + bcp->bc_next = NULL; + prevp->bc_next = bcp; trace_xfs_log_recover_buf_cancel_add(log, buf_f); - return 0; } /* @@ -1670,13 +1698,14 @@ xlog_recover_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - struct log *log, + xlog_t *log, xfs_daddr_t blkno, uint len, ushort flags) { - struct list_head *bucket; - struct xfs_buf_cancel *bcp; + xfs_buf_cancel_t *bcp; + xfs_buf_cancel_t *prevp; + xfs_buf_cancel_t **bucket; if (log->l_buf_cancel_table == NULL) { /* @@ -1687,70 +1716,128 @@ xlog_check_buffer_cancelled( return 0; } + bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % + XLOG_BC_TABLE_SIZE]; + bcp = *bucket; + if (bcp == NULL) { + /* + * There is no corresponding entry in the table built + * in pass one, so this buffer has not been cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return 0; + } + /* - * Search for an entry in the cancel table that matches our buffer. + * Search for an entry in the buffer cancel table that + * matches our buffer. */ - bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) - goto found; + prevp = NULL; + while (bcp != NULL) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) { + /* + * We've go a match, so return 1 so that the + * recovery of this buffer is cancelled. + * If this buffer is actually a buffer cancel + * log item, then decrement the refcount on the + * one in the table and remove it if this is the + * last reference. + */ + if (flags & XFS_BLF_CANCEL) { + bcp->bc_refcount--; + if (bcp->bc_refcount == 0) { + if (prevp == NULL) { + *bucket = bcp->bc_next; + } else { + prevp->bc_next = bcp->bc_next; + } + kmem_free(bcp); + } + } + return 1; + } + prevp = bcp; + bcp = bcp->bc_next; } - /* - * We didn't find a corresponding entry in the table, so return 0 so - * that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so + * return 0 so that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; +} -found: - /* - * We've go a match, so return 1 so that the recovery of this buffer - * is cancelled. If this buffer is actually a buffer cancel log - * item, then decrement the refcount on the one in the table and - * remove it if this is the last reference. - */ - if (flags & XFS_BLF_CANCEL) { - if (--bcp->bc_refcount == 0) { - list_del(&bcp->bc_list); - kmem_free(bcp); - } +STATIC int +xlog_recover_do_buffer_pass2( + xlog_t *log, + xfs_buf_log_format_t *buf_f) +{ + xfs_daddr_t blkno = 0; + ushort flags = 0; + uint len = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + flags = buf_f->blf_flags; + len = buf_f->blf_len; + break; } - return 1; + + return xlog_check_buffer_cancelled(log, blkno, len, flags); } /* - * Perform recovery for a buffer full of inodes. In these buffers, the only - * data which should be recovered is that which corresponds to the - * di_next_unlinked pointers in the on disk inode structures. 
The rest of the - * data for the inodes is always logged through the inodes themselves rather - * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * Perform recovery for a buffer full of inodes. In these buffers, + * the only data which should be recovered is that which corresponds + * to the di_next_unlinked pointers in the on disk inode structures. + * The rest of the data for the inodes is always logged through the + * inodes themselves rather than the inode buffer and is recovered + * in xlog_recover_do_inode_trans(). * - * The only time when buffers full of inodes are fully recovered is when the - * buffer is full of newly allocated inodes. In this case the buffer will - * not be marked as an inode buffer and so will be sent to - * xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is + * when the buffer is full of newly allocated inodes. In this case + * the buffer will not be marked as an inode buffer and so will be + * sent to xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - struct xfs_mount *mp, + xfs_mount_t *mp, xlog_recover_item_t *item, - struct xfs_buf *bp, + xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index = 0; - int bit = 0; - int nbits = 0; - int reg_buf_offset = 0; - int reg_buf_bytes = 0; + int item_index; + int bit; + int nbits; + int reg_buf_offset; + int reg_buf_bytes; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; + unsigned int *data_map = NULL; + unsigned int map_size = 0; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { + case XFS_LI_BUF: + data_map = buf_f->blf_data_map; + map_size = buf_f->blf_map_size; + break; + } + /* + * Set the variables corresponding to the current region to + * 0 so that we'll initialize them on the first pass through + * the loop. + */ + reg_buf_offset = 0; + reg_buf_bytes = 0; + bit = 0; + nbits = 0; + item_index = 0; inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1765,18 +1852,18 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); + bit = xfs_next_bit(data_map, map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) + if (bit == -1) { return 0; + } - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); + nbits = xfs_contig_bits(data_map, map_size, + bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; @@ -1788,8 +1875,9 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) + if (next_unlinked_offset < reg_buf_offset) { continue; + } ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); @@ -1825,29 +1913,36 @@ xlog_recover_do_inode_buffer( * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. 
*/ +/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, - struct xfs_buf *bp, + xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; + unsigned int *data_map = NULL; + unsigned int map_size = 0; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { + case XFS_LI_BUF: + data_map = buf_f->blf_data_map; + map_size = buf_f->blf_map_size; + break; + } bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); + bit = xfs_next_bit(data_map, map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); + nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); @@ -2081,46 +2176,77 @@ xlog_recover_do_dquot_buffer( * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_buffer_pass2( +xlog_recover_do_buffer_trans( xlog_t *log, - xlog_recover_item_t *item) + xlog_recover_item_t *item, + int pass) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp = log->l_mp; + xfs_mount_t *mp; xfs_buf_t *bp; int error; + int cancel; + xfs_daddr_t blkno; + int len; + ushort flags; uint buf_flags; - /* - * In this pass we only want to recover all the buffers which have - * not been cancelled and are not cancellation buffers themselves. - */ - if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - trace_xfs_log_recover_buf_cancel(log, buf_f); + if (pass == XLOG_RECOVER_PASS1) { + /* + * In this pass we're only looking for buf items + * with the XFS_BLF_CANCEL bit set. + */ + xlog_recover_do_buffer_pass1(log, buf_f); return 0; + } else { + /* + * In this pass we want to recover all the buffers + * which have not been cancelled and are not + * cancellation buffers themselves. The routine + * we call here will tell us whether or not to + * continue with the replay of this buffer. + */ + cancel = xlog_recover_do_buffer_pass2(log, buf_f); + if (cancel) { + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; + } } - trace_xfs_log_recover_buf_recover(log, buf_f); + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + len = buf_f->blf_len; + flags = buf_f->blf_flags; + break; + default: + xfs_fs_cmn_err(CE_ALERT, log->l_mp, + "xfs_log_recover: unknown buffer type 0x%x, logdev %s", + buf_f->blf_type, log->l_mp->m_logname ? 
+ log->l_mp->m_logname : "internal"); + XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) + if (!(flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); + xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, + bp, blkno); error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } error = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (buf_f->blf_flags & + } else if (flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { @@ -2160,14 +2286,16 @@ xlog_recover_buffer_pass2( } STATIC int -xlog_recover_inode_pass2( +xlog_recover_do_inode_trans( xlog_t *log, - xlog_recover_item_t *item) + xlog_recover_item_t *item, + int pass) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp = log->l_mp; + xfs_mount_t *mp; xfs_buf_t *bp; xfs_dinode_t *dip; + xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2177,6 +2305,10 @@ xlog_recover_inode_pass2( xfs_icdinode_t *dicp; int need_free = 0; + if (pass == XLOG_RECOVER_PASS1) { + return 0; + } + if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { @@ -2186,6 +2318,8 @@ xlog_recover_inode_pass2( if (error) goto error; } + ino = in_f->ilf_ino; + mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2220,8 +2354,8 @@ xlog_recover_inode_pass2( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", + dip, bp, ino); + XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2231,8 +2365,8 @@ xlog_recover_inode_pass2( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", + item, ino); + XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2260,12 +2394,12 @@ xlog_recover_inode_pass2( if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, in_f->ilf_ino); + item, dip, bp, ino); error = EFSCORRUPTED; goto error; } @@ -2273,40 +2407,40 @@ xlog_recover_inode_pass2( if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: 
Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, in_f->ilf_ino); + item, dip, bp, ino); error = EFSCORRUPTED; goto error; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, in_f->ilf_ino, + item, dip, bp, ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; goto error; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + item, dip, bp, ino, dicp->di_forkoff); error = EFSCORRUPTED; goto error; } if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2398,7 +2532,7 @@ xlog_recover_inode_pass2( break; default: - xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); + xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); ASSERT(0); xfs_buf_relse(bp); error = EIO; @@ -2422,11 +2556,18 @@ xlog_recover_inode_pass2( * of that type. */ STATIC int -xlog_recover_quotaoff_pass1( +xlog_recover_do_quotaoff_trans( xlog_t *log, - xlog_recover_item_t *item) + xlog_recover_item_t *item, + int pass) { - xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f; + + if (pass == XLOG_RECOVER_PASS2) { + return (0); + } + + qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2447,17 +2588,22 @@ xlog_recover_quotaoff_pass1( * Recover a dquot record */ STATIC int -xlog_recover_dquot_pass2( +xlog_recover_do_dquot_trans( xlog_t *log, - xlog_recover_item_t *item) + xlog_recover_item_t *item, + int pass) { - xfs_mount_t *mp = log->l_mp; + xfs_mount_t *mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; + if (pass == XLOG_RECOVER_PASS1) { + return 0; + } + mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2501,7 +2647,7 @@ xlog_recover_dquot_pass2( if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2 (log copy)"))) { + "xlog_recover_do_dquot_trans (log copy)"))) { return XFS_ERROR(EIO); } ASSERT(dq_f->qlf_len == 1); @@ -2524,7 +2670,7 @@ xlog_recover_dquot_pass2( * minimal initialization then. */ if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2")) { + "xlog_recover_do_dquot_trans")) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } @@ -2547,31 +2693,38 @@ xlog_recover_dquot_pass2( * LSN. 
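Before a logged inode is copied back over the on-disk copy, the recovery code runs a series of sanity checks (magic number, format consistent with the file mode, extent counts not exceeding the block count, fork offset within the inode) and bails out with EFSCORRUPTED if any fail. A condensed, hypothetical version of that layered validation, with invented field names and EUCLEAN standing in for the kernel's EFSCORRUPTED, looks like this:

#include <errno.h>
#include <stdint.h>

#define INODE_MAGIC	0x494e		/* "IN", purely illustrative */

/* Invented, simplified on-disk inode fields for the sketch. */
struct din {
	uint16_t magic;
	uint16_t mode;			/* upper bits encode the file type */
	uint8_t  format;		/* 1 = local, 2 = extents, 3 = btree */
	uint32_t nextents;
	uint64_t nblocks;
	uint16_t forkoff;
};

static int validate_logged_inode(const struct din *d, unsigned int inodesize)
{
	if (d->magic != INODE_MAGIC)
		return -EUCLEAN;			/* bad magic number */
	if ((d->mode & 0xf000) == 0x8000 &&		/* regular file */
	    d->format != 2 && d->format != 3)
		return -EUCLEAN;			/* regular files need extents or btree */
	if (d->nextents > d->nblocks)
		return -EUCLEAN;			/* more extents than blocks is impossible */
	if (d->forkoff > inodesize)
		return -EUCLEAN;			/* attr fork offset past end of inode */
	return 0;					/* safe to replay */
}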
*/ STATIC int -xlog_recover_efi_pass2( +xlog_recover_do_efi_trans( xlog_t *log, xlog_recover_item_t *item, - xfs_lsn_t lsn) + xfs_lsn_t lsn, + int pass) { int error; - xfs_mount_t *mp = log->l_mp; + xfs_mount_t *mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; + if (pass == XLOG_RECOVER_PASS1) { + return 0; + } + efi_formatp = item->ri_buf[0].i_addr; + mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + efip->efi_next_extent = efi_formatp->efi_nextents; + efip->efi_flags |= XFS_EFI_COMMITTED; spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); return 0; } @@ -2584,10 +2737,11 @@ xlog_recover_efi_pass2( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC int -xlog_recover_efd_pass2( +STATIC void +xlog_recover_do_efd_trans( xlog_t *log, - xlog_recover_item_t *item) + xlog_recover_item_t *item, + int pass) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2596,6 +2750,10 @@ xlog_recover_efd_pass2( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; + if (pass == XLOG_RECOVER_PASS1) { + return; + } + efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || @@ -2627,6 +2785,62 @@ xlog_recover_efd_pass2( } xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); +} + +/* + * Perform the transaction + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. 
+ */ +STATIC int +xlog_recover_do_trans( + xlog_t *log, + xlog_recover_t *trans, + int pass) +{ + int error = 0; + xlog_recover_item_t *item; + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) + return error; + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + error = xlog_recover_do_buffer_trans(log, item, pass); + break; + case XFS_LI_INODE: + error = xlog_recover_do_inode_trans(log, item, pass); + break; + case XFS_LI_EFI: + error = xlog_recover_do_efi_trans(log, item, + trans->r_lsn, pass); + break; + case XFS_LI_EFD: + xlog_recover_do_efd_trans(log, item, pass); + error = 0; + break; + case XFS_LI_DQUOT: + error = xlog_recover_do_dquot_trans(log, item, pass); + break; + case XFS_LI_QUOTAOFF: + error = xlog_recover_do_quotaoff_trans(log, item, + pass); + break; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + } + + if (error) + return error; + } return 0; } @@ -2638,7 +2852,7 @@ xlog_recover_efd_pass2( */ STATIC void xlog_recover_free_trans( - struct xlog_recover *trans) + xlog_recover_t *trans) { xlog_recover_item_t *item, *n; int i; @@ -2656,96 +2870,18 @@ xlog_recover_free_trans( kmem_free(trans); } -STATIC int -xlog_recover_commit_pass1( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass1(log, item); - case XFS_LI_QUOTAOFF: - return xlog_recover_quotaoff_pass1(log, item); - case XFS_LI_INODE: - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_DQUOT: - /* nothing to do in pass 1 */ - return 0; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_commit_pass1", - ITEM_TYPE(item)); - ASSERT(0); - return XFS_ERROR(EIO); - } -} - -STATIC int -xlog_recover_commit_pass2( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); - case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); - case XFS_LI_EFI: - return xlog_recover_efi_pass2(log, item, trans->r_lsn); - case XFS_LI_EFD: - return xlog_recover_efd_pass2(log, item); - case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); - case XFS_LI_QUOTAOFF: - /* nothing to do in pass2 */ - return 0; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_commit_pass2", - ITEM_TYPE(item)); - ASSERT(0); - return XFS_ERROR(EIO); - } -} - -/* - * Perform the transaction. - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. 
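This hunk folds the per-pass commit helpers back into a single xlog_recover_do_trans() that takes the pass number and switches on the item type, with each handler deciding whether the current pass concerns it. A minimal sketch of that dispatch shape, assuming made-up item types and handlers, is:

#include <stdio.h>

enum pass { PASS1 = 1, PASS2 = 2 };
enum item_type { ITEM_BUF, ITEM_INODE, ITEM_QUOTAOFF };

struct item { enum item_type type; };

/* Each handler decides internally whether the current pass concerns it. */
static int recover_buf(const struct item *it, enum pass pass)
{
	if (pass == PASS1)
		printf("pass 1: note cancelled buffers\n");
	else
		printf("pass 2: replay non-cancelled buffers\n");
	return 0;
}

static int recover_inode(const struct item *it, enum pass pass)
{
	if (pass == PASS1)
		return 0;			/* inodes are only replayed in pass 2 */
	printf("pass 2: replay inode\n");
	return 0;
}

static int recover_quotaoff(const struct item *it, enum pass pass)
{
	if (pass == PASS2)
		return 0;			/* quota-off state only matters in pass 1 */
	printf("pass 1: record quota-off\n");
	return 0;
}

static int recover_item(const struct item *it, enum pass pass)
{
	switch (it->type) {
	case ITEM_BUF:		return recover_buf(it, pass);
	case ITEM_INODE:	return recover_inode(it, pass);
	case ITEM_QUOTAOFF:	return recover_quotaoff(it, pass);
	default:		return -1;	/* unknown type: treat the log as corrupt */
	}
}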
- */ STATIC int xlog_recover_commit_trans( - struct log *log, - struct xlog_recover *trans, + xlog_t *log, + xlog_recover_t *trans, int pass) { - int error = 0; - xlog_recover_item_t *item; + int error; hlist_del(&trans->r_list); - - error = xlog_recover_reorder_trans(log, trans, pass); - if (error) + if ((error = xlog_recover_do_trans(log, trans, pass))) return error; - - list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) - error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); - if (error) - return error; - } - - xlog_recover_free_trans(trans); + xlog_recover_free_trans(trans); /* no error */ return 0; } @@ -2875,7 +3011,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); /* * First check the validity of the extents described by the @@ -2914,7 +3050,7 @@ xlog_recover_process_efi( extp->ext_len); } - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + efip->efi_flags |= XFS_EFI_RECOVERED; error = xfs_trans_commit(tp, 0); return error; @@ -2971,7 +3107,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { + if (efip->efi_flags & XFS_EFI_RECOVERED) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3588,7 +3724,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3596,12 +3732,10 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), + log->l_buf_cancel_table = + (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(xfs_buf_cancel_t*), KM_SLEEP); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); - error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { @@ -3620,7 +3754,7 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); + ASSERT(log->l_buf_cancel_table[i] == NULL); } #endif /* DEBUG */ diff --git a/trunk/fs/xfs/xfs_mount.c b/trunk/fs/xfs/xfs_mount.c index d447aef84bc3..19e9dfa1c254 100644 --- a/trunk/fs/xfs/xfs_mount.c +++ b/trunk/fs/xfs/xfs_mount.c @@ -472,7 +472,7 @@ xfs_initialize_perag( goto out_unwind; pag->pag_agno = index; pag->pag_mount = mp; - spin_lock_init(&pag->pag_ici_lock); + rwlock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); spin_lock_init(&pag->pag_buf_lock); @@ -974,24 +974,6 @@ xfs_set_rw_sizes(xfs_mount_t *mp) mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); } -/* - * precalculate the low space thresholds for dynamic speculative preallocation. - */ -void -xfs_set_low_space_thresholds( - struct xfs_mount *mp) -{ - int i; - - for (i = 0; i < XFS_LOWSP_MAX; i++) { - __uint64_t space = mp->m_sb.sb_dblocks; - - do_div(space, 100); - mp->m_low_space[i] = space * (i + 1); - } -} - - /* * Set whether we're using inode alignment. 
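The removed xfs_set_low_space_thresholds() precalculates 1% through 5% of the filesystem's data blocks once at mount time so the hot allocation paths never have to divide. The arithmetic is just integer percentage steps; a stand-alone equivalent, with an invented example size:

#include <stdint.h>
#include <stdio.h>

#define LOWSP_MAX 5	/* thresholds at 1%, 2%, 3%, 4% and 5% of the data blocks */

static void set_low_space_thresholds(uint64_t dblocks, uint64_t low_space[LOWSP_MAX])
{
	int i;

	for (i = 0; i < LOWSP_MAX; i++)
		low_space[i] = (dblocks / 100) * (i + 1);	/* divide once, then scale */
}

int main(void)
{
	uint64_t low_space[LOWSP_MAX];
	int i;

	set_low_space_thresholds(26214400, low_space);		/* 100 GiB of 4 KiB blocks */
	for (i = 0; i < LOWSP_MAX; i++)
		printf("%d%%: %llu blocks\n", i + 1, (unsigned long long)low_space[i]);
	return 0;
}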
*/ @@ -1214,9 +1196,6 @@ xfs_mountfs( */ xfs_set_rw_sizes(mp); - /* set the low space thresholds for dynamic preallocation */ - xfs_set_low_space_thresholds(mp); - /* * Set the inode cluster size. * This may still be overridden by the file system diff --git a/trunk/fs/xfs/xfs_mount.h b/trunk/fs/xfs/xfs_mount.h index a62e8971539d..5861b4980740 100644 --- a/trunk/fs/xfs/xfs_mount.h +++ b/trunk/fs/xfs/xfs_mount.h @@ -103,16 +103,6 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, xfs_mod_incore_sb(mp, field, delta, rsvd) #endif -/* dynamic preallocation free space thresholds, 5% down to 1% */ -enum { - XFS_LOWSP_1_PCNT = 0, - XFS_LOWSP_2_PCNT, - XFS_LOWSP_3_PCNT, - XFS_LOWSP_4_PCNT, - XFS_LOWSP_5_PCNT, - XFS_LOWSP_MAX, -}; - typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -212,8 +202,6 @@ typedef struct xfs_mount { __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ - int64_t m_low_space[XFS_LOWSP_MAX]; - /* low free space thresholds */ } xfs_mount_t; /* @@ -391,8 +379,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); -extern void xfs_set_low_space_thresholds(struct xfs_mount *); - #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/trunk/fs/xfs/xfs_trans.c b/trunk/fs/xfs/xfs_trans.c index f80a067a4658..f6d956b7711e 100644 --- a/trunk/fs/xfs/xfs_trans.c +++ b/trunk/fs/xfs/xfs_trans.c @@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -static void +void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -1425,83 +1425,6 @@ xfs_trans_committed( xfs_trans_free(tp); } -static inline void -xfs_log_item_batch_insert( - struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, - xfs_lsn_t commit_lsn) -{ - int i; - - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ - xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); - - for (i = 0; i < nr_items; i++) - IOP_UNPIN(log_items[i], 0); -} - -/* - * Bulk operation version of xfs_trans_committed that takes a log vector of - * items to insert into the AIL. This uses bulk AIL insertion techniques to - * minimise lock traffic. - */ -void -xfs_trans_committed_bulk( - struct xfs_ail *ailp, - struct xfs_log_vec *log_vector, - xfs_lsn_t commit_lsn, - int aborted) -{ -#define LOG_ITEM_BATCH_SIZE 32 - struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; - struct xfs_log_vec *lv; - int i = 0; - - /* unpin all the log items */ - for (lv = log_vector; lv; lv = lv->lv_next ) { - struct xfs_log_item *lip = lv->lv_item; - xfs_lsn_t item_lsn; - - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); - - /* item_lsn of -1 means the item was freed */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - continue; - - if (item_lsn != commit_lsn) { - - /* - * Not a bulk update option due to unusual item_lsn. - * Push into AIL immediately, rechecking the lsn once - * we have the ail lock. Then unpin the item. 
- */ - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) - xfs_trans_ail_update(ailp, lip, item_lsn); - else - spin_unlock(&ailp->xa_lock); - IOP_UNPIN(lip, 0); - continue; - } - - /* Item is a candidate for bulk AIL insert. */ - log_items[i++] = lv->lv_item; - if (i >= LOG_ITEM_BATCH_SIZE) { - xfs_log_item_batch_insert(ailp, log_items, - LOG_ITEM_BATCH_SIZE, commit_lsn); - i = 0; - } - } - - /* make sure we insert the remainder! */ - if (i) - xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); -} - /* * Called from the trans_commit code when we notice that * the filesystem is in the middle of a forced shutdown. diff --git a/trunk/fs/xfs/xfs_trans.h b/trunk/fs/xfs/xfs_trans.h index c2042b736b81..246286b77a86 100644 --- a/trunk/fs/xfs/xfs_trans.h +++ b/trunk/fs/xfs/xfs_trans.h @@ -294,8 +294,8 @@ struct xfs_log_item_desc { #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 -#define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 +#define XFS_INO_REF 1 #define XFS_DQUOT_REF 1 #ifdef __KERNEL__ diff --git a/trunk/fs/xfs/xfs_trans_ail.c b/trunk/fs/xfs/xfs_trans_ail.c index c5bbbc45db91..dc9069568ff7 100644 --- a/trunk/fs/xfs/xfs_trans_ail.c +++ b/trunk/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,8 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); -STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); +STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); @@ -449,153 +449,130 @@ xfs_trans_unlocked_item( xfs_log_move_tail(ailp->xa_mount, 1); } /* xfs_trans_unlocked_item */ + /* - * xfs_trans_ail_update - bulk AIL insertion operation. - * - * @xfs_trans_ail_update takes an array of log items that all need to be - * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added. Otherwise, it will be repositioned by removing it and re-adding - * it to the AIL. If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * Update the position of the item in the AIL with the new + * lsn. If it is not yet in the AIL, add it. Otherwise, move + * it to its new position by removing it and re-adding it. * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * Wakeup anyone with an lsn less than the item's lsn. If the item + * we move in the AIL is the minimum one, update the tail lsn in the + * log manager. * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. - * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * This function must be called with the AIL lock held. The lock + * is dropped before returning. 
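The xfs_trans_committed_bulk() code being removed batches up to 32 log items into a local array and inserts the whole batch into the AIL under a single lock round-trip, falling back to an immediate insert for items with an unusual LSN. The batching itself is a generic pattern; below is a small sketch of it in which batch_insert() is an invented stand-in for the locked bulk AIL insert.

#include <stdio.h>

#define BATCH_SIZE 32

struct log_item { long lsn; };

/* Stand-in for the locked bulk AIL insert: take the lock once per batch. */
static void batch_insert(struct log_item **items, int nr, long commit_lsn)
{
	printf("insert %d items at lsn %ld under one lock hold\n", nr, commit_lsn);
}

static void committed_bulk(struct log_item **items, int nr_items, long commit_lsn)
{
	struct log_item *batch[BATCH_SIZE];
	int i, n = 0;

	for (i = 0; i < nr_items; i++) {
		if (items[i]->lsn != commit_lsn) {
			/* unusual LSN: not a candidate for the bulk path */
			batch_insert(&items[i], 1, items[i]->lsn);
			continue;
		}
		batch[n++] = items[i];
		if (n == BATCH_SIZE) {
			batch_insert(batch, n, commit_lsn);
			n = 0;
		}
	}
	if (n)					/* don't forget the remainder */
		batch_insert(batch, n, commit_lsn);
}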
*/ void -xfs_trans_ail_update_bulk( - struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock) +xfs_trans_ail_update( + struct xfs_ail *ailp, + xfs_log_item_t *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; + xfs_log_item_t *dlip = NULL; + xfs_log_item_t *mlip; /* ptr to minimum lip */ xfs_lsn_t tail_lsn; - int mlip_changed = 0; - int i; - LIST_HEAD(tmp); mlip = xfs_ail_min(ailp); - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (lip->li_flags & XFS_LI_IN_AIL) { - /* check if we really need to move the item */ - if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) - continue; - - xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; - } else { - lip->li_flags |= XFS_LI_IN_AIL; - } - lip->li_lsn = lsn; - list_add(&lip->li_ail, &tmp); + if (lip->li_flags & XFS_LI_IN_AIL) { + dlip = xfs_ail_delete(ailp, lip); + ASSERT(dlip == lip); + xfs_trans_ail_cursor_clear(ailp, dlip); + } else { + lip->li_flags |= XFS_LI_IN_AIL; } - xfs_ail_splice(ailp, &tmp, lsn); + lip->li_lsn = lsn; + xfs_ail_insert(ailp, lip); - if (!mlip_changed) { + if (mlip == dlip) { + mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock is + * dropped, so we must get a copy of li_lsn before we do + * so. This is especially important on 32-bit platforms + * where accessing and updating 64-bit values like li_lsn + * is not atomic. + */ + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); + } else { spin_unlock(&ailp->xa_lock); - return; } - /* - * It is not safe to access mlip after the AIL lock is dropped, so we - * must get a copy of li_lsn before we do so. This is especially - * important on 32-bit platforms where accessing and updating 64-bit - * values like li_lsn is not atomic. - */ - mlip = xfs_ail_min(ailp); - tail_lsn = mlip->li_lsn; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); -} + +} /* xfs_trans_update_ail */ /* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * Delete the given item from the AIL. It must already be in + * the AIL. * - * @xfs_trans_ail_delete_bulk takes an array of log items that all need to - * removed from the AIL. The caller is already holding the AIL lock, and done - * all the checks necessary to ensure the items passed in via @log_items are - * ready for deletion. This includes checking that the items are in the AIL. + * Wakeup anyone with an lsn less than item's lsn. If the item + * we delete in the AIL is the minimum one, update the tail lsn in the + * log manager. * - * For each log item to be removed, unlink it from the AIL, clear the IN_AIL - * flag from the item and reset the item's lsn to 0. If we remove the first - * item in the AIL, update the log tail to match the new minimum LSN in the - * AIL. + * Clear the IN_AIL flag from the item, reset its lsn to 0, and + * bump the AIL's generation count to indicate that the tree + * has changed. * - * This function will not drop the AIL lock until all items are removed from - * the AIL to minimise the amount of lock traffic on the AIL. This does not - * greatly increase the AIL hold time, but does significantly reduce the amount - * of traffic on the lock, especially during IO completion. - * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * This function must be called with the AIL lock held. The lock + * is dropped before returning. 
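Both AIL paths take care to copy the minimum item's li_lsn into a local variable while still holding xa_lock, because a 64-bit load is not atomic on 32-bit machines and the item may go away once the lock drops. The same defensive pattern, reduced to a pthreads sketch with invented names:

#include <pthread.h>
#include <stdint.h>

struct ail {
	pthread_mutex_t	lock;
	uint64_t	min_lsn;	/* imagine this tracks the smallest item LSN */
};

static void move_tail(uint64_t lsn) { (void)lsn; /* stand-in for xfs_log_move_tail() */ }

static void update_tail(struct ail *ail)
{
	uint64_t tail_lsn;

	pthread_mutex_lock(&ail->lock);
	/* copy the 64-bit value while it is still stable ... */
	tail_lsn = ail->min_lsn;
	pthread_mutex_unlock(&ail->lock);
	/* ... and only use the private copy after dropping the lock */
	move_tail(tail_lsn);
}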
*/ void -xfs_trans_ail_delete_bulk( - struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items) __releases(ailp->xa_lock) +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) { + xfs_log_item_t *dlip; xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; - int mlip_changed = 0; - int i; - mlip = xfs_ail_min(ailp); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; + if (lip->li_flags & XFS_LI_IN_AIL) { + mlip = xfs_ail_min(ailp); + dlip = xfs_ail_delete(ailp, lip); + ASSERT(dlip == lip); + xfs_trans_ail_cursor_clear(ailp, dlip); - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } - return; - } - xfs_ail_delete(ailp, lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + + if (mlip == dlip) { + mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock + * is dropped, so we must get a copy of li_lsn + * before we do so. This is especially important + * on 32-bit platforms where accessing and updating + * 64-bit values like li_lsn is not atomic. + */ + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); + } else { + spin_unlock(&ailp->xa_lock); + } } + else { + /* + * If the file system is not being shutdown, we are in + * serious trouble if we get to this stage. + */ + struct xfs_mount *mp = ailp->xa_mount; - if (!mlip_changed) { spin_unlock(&ailp->xa_lock); - return; + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } } - - /* - * It is not safe to access mlip after the AIL lock is dropped, so we - * must get a copy of li_lsn before we do so. This is especially - * important on 32-bit platforms where accessing and updating 64-bit - * values like li_lsn is not atomic. It is possible we've emptied the - * AIL here, so if that is the case, pass an LSN of 0 to the tail move. - */ - mlip = xfs_ail_min(ailp); - tail_lsn = mlip ? mlip->li_lsn : 0; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); } + + /* * The active item list (AIL) is a doubly linked list of log * items sorted by ascending lsn. The base of the list is @@ -646,13 +623,16 @@ xfs_trans_ail_destroy( } /* - * splice the log item list into the AIL at the given LSN. + * Insert the given log item into the AIL. + * We almost always insert at the end of the list, so on inserts + * we search from the end of the list to find where the + * new item belongs. */ STATIC void -xfs_ail_splice( +xfs_ail_insert( struct xfs_ail *ailp, - struct list_head *list, - xfs_lsn_t lsn) + xfs_log_item_t *lip) +/* ARGSUSED */ { xfs_log_item_t *next_lip; @@ -660,33 +640,39 @@ xfs_ail_splice( * If the list is empty, just insert the item. 
*/ if (list_empty(&ailp->xa_ail)) { - list_splice(list, &ailp->xa_ail); + list_add(&lip->li_ail, &ailp->xa_ail); return; } list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) + if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) break; } ASSERT((&next_lip->li_ail == &ailp->xa_ail) || - (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)); + (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); + + list_add(&lip->li_ail, &next_lip->li_ail); - list_splice_init(list, &next_lip->li_ail); + xfs_ail_check(ailp, lip); return; } /* * Delete the given item from the AIL. Return a pointer to the item. */ -STATIC void +/*ARGSUSED*/ +STATIC xfs_log_item_t * xfs_ail_delete( struct xfs_ail *ailp, xfs_log_item_t *lip) +/* ARGSUSED */ { xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); - xfs_trans_ail_cursor_clear(ailp, lip); + + return lip; } /* @@ -696,6 +682,7 @@ xfs_ail_delete( STATIC xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) +/* ARGSUSED */ { if (list_empty(&ailp->xa_ail)) return NULL; @@ -712,6 +699,7 @@ STATIC xfs_log_item_t * xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) +/* ARGSUSED */ { if (lip->li_ail.next == &ailp->xa_ail) return NULL; diff --git a/trunk/fs/xfs/xfs_trans_extfree.c b/trunk/fs/xfs/xfs_trans_extfree.c index f7590f5badea..f783d5e9fa70 100644 --- a/trunk/fs/xfs/xfs_trans_extfree.c +++ b/trunk/fs/xfs/xfs_trans_extfree.c @@ -69,16 +69,12 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. - */ - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + next_extent = efip->efi_next_extent; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &(efip->efi_format.efi_extents[next_extent]); extp->ext_start = start_block; extp->ext_len = ext_len; + efip->efi_next_extent++; } diff --git a/trunk/fs/xfs/xfs_trans_priv.h b/trunk/fs/xfs/xfs_trans_priv.h index 35162c238fa3..62da86c90de5 100644 --- a/trunk/fs/xfs/xfs_trans_priv.h +++ b/trunk/fs/xfs/xfs_trans_priv.h @@ -22,17 +22,15 @@ struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; -struct xfs_ail; -struct xfs_log_vec; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, int flags); +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, - xfs_lsn_t commit_lsn, int aborted); /* * AIL traversal cursor. 
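The restored xfs_ail_insert() relies on the observation that most insertions land at or near the end of the LSN-sorted list, so it scans backwards from the tail for the insertion point instead of forwards from the head. A doubly linked list version of that search, assuming hypothetical node and list names:

#include <stddef.h>

struct node {
	long		lsn;
	struct node	*prev, *next;
};

struct list {
	struct node head;	/* sentinel: head.next is the first item, head.prev the last */
};

static void list_init(struct list *l)
{
	l->head.next = l->head.prev = &l->head;
}

/* Insert item keeping the list sorted by ascending lsn, searching from the tail. */
static void sorted_insert_from_tail(struct list *l, struct node *item)
{
	struct node *pos = l->head.prev;

	/* walk backwards until we find a node that sorts at or before item */
	while (pos != &l->head && pos->lsn > item->lsn)
		pos = pos->prev;

	/* link item immediately after pos (pos may be the sentinel) */
	item->prev = pos;
	item->next = pos->next;
	pos->next->prev = item;
	pos->next = item;
}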
* @@ -75,29 +73,12 @@ struct xfs_ail { /* * From xfs_trans_ail.c */ -void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_update( - struct xfs_ail *ailp, - struct xfs_log_item *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) -{ - xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); -} - -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1); -} - +void xfs_trans_ail_update(struct xfs_ail *ailp, + struct xfs_log_item *lip, xfs_lsn_t lsn) + __releases(ailp->xa_lock); +void xfs_trans_ail_delete(struct xfs_ail *ailp, + struct xfs_log_item *lip) + __releases(ailp->xa_lock); void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); diff --git a/trunk/fs/xfs/xfs_vnodeops.c b/trunk/fs/xfs/xfs_vnodeops.c index d8e6f8cd6f0c..8e4a63c4151a 100644 --- a/trunk/fs/xfs/xfs_vnodeops.c +++ b/trunk/fs/xfs/xfs_vnodeops.c @@ -964,48 +964,29 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } - if (ip->i_d.di_nlink == 0) - return 0; - - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - - /* - * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the - * last reference to the inode is dropped, so we'll never leak - * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * oustanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. - */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); - if (error) - return error; + if (ip->i_d.di_nlink != 0) { + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - /* delalloc blocks after truncation means it really is dirty */ - if (ip->i_delayed_blks) - xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + /* + * If we can't get the iolock just skip truncating + * the blocks past EOF because we could deadlock + * with the mmap_sem otherwise. We'll get another + * chance to drop them once the last reference to + * the inode is dropped, so we'll never leak blocks + * permanently. 
+ */ + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + } } + return 0; } diff --git a/trunk/include/linux/dynamic_debug.h b/trunk/include/linux/dynamic_debug.h index 1c70028f81f9..a90b3892074a 100644 --- a/trunk/include/linux/dynamic_debug.h +++ b/trunk/include/linux/dynamic_debug.h @@ -44,24 +44,34 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, extern int ddebug_remove_module(const char *mod_name); #define dynamic_pr_debug(fmt, ...) do { \ + __label__ do_printk; \ + __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - if (unlikely(descriptor.enabled)) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + JUMP_LABEL(&descriptor.enabled, do_printk); \ + goto out; \ +do_printk: \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +out: ; \ } while (0) #define dynamic_dev_dbg(dev, fmt, ...) do { \ + __label__ do_printk; \ + __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - if (unlikely(descriptor.enabled)) \ - dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ + JUMP_LABEL(&descriptor.enabled, do_printk); \ + goto out; \ +do_printk: \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ +out: ; \ } while (0) #else diff --git a/trunk/include/linux/mfd/tmio.h b/trunk/include/linux/mfd/tmio.h index 8e70310ee945..085f041197dc 100644 --- a/trunk/include/linux/mfd/tmio.h +++ b/trunk/include/linux/mfd/tmio.h @@ -57,10 +57,6 @@ * is configured in 4-bit mode. */ #define TMIO_MMC_BLKSZ_2BYTES (1 << 1) -/* - * Some controllers can support SDIO IRQ signalling. - */ -#define TMIO_MMC_SDIO_IRQ (1 << 2) int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); @@ -70,7 +66,6 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct tmio_mmc_dma { void *chan_priv_tx; void *chan_priv_rx; - int alignment_shift; }; /* diff --git a/trunk/include/linux/mmc/dw_mmc.h b/trunk/include/linux/mmc/dw_mmc.h deleted file mode 100644 index 16b0261763ed..000000000000 --- a/trunk/include/linux/mmc/dw_mmc.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Synopsys DesignWare Multimedia Card Interface driver - * (Based on NXP driver for lpc 31xx) - * - * Copyright (C) 2009 NXP Semiconductors - * Copyright (C) 2009, 2010 Imagination Technologies Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _LINUX_MMC_DW_MMC_H_ -#define _LINUX_MMC_DW_MMC_H_ - -#define MAX_MCI_SLOTS 2 - -enum dw_mci_state { - STATE_IDLE = 0, - STATE_SENDING_CMD, - STATE_SENDING_DATA, - STATE_DATA_BUSY, - STATE_SENDING_STOP, - STATE_DATA_ERROR, -}; - -enum { - EVENT_CMD_COMPLETE = 0, - EVENT_XFER_COMPLETE, - EVENT_DATA_COMPLETE, - EVENT_DATA_ERROR, - EVENT_XFER_ERROR -}; - -struct mmc_data; - -/** - * struct dw_mci - MMC controller state shared between all slots - * @lock: Spinlock protecting the queue and associated data. - * @regs: Pointer to MMIO registers. - * @sg: Scatterlist entry currently being processed by PIO code, if any. 
- * @pio_offset: Offset into the current scatterlist entry. - * @cur_slot: The slot which is currently using the controller. - * @mrq: The request currently being processed on @cur_slot, - * or NULL if the controller is idle. - * @cmd: The command currently being sent to the card, or NULL. - * @data: The data currently being transferred, or NULL if no data - * transfer is in progress. - * @use_dma: Whether DMA channel is initialized or not. - * @sg_dma: Bus address of DMA buffer. - * @sg_cpu: Virtual address of DMA buffer. - * @dma_ops: Pointer to platform-specific DMA callbacks. - * @cmd_status: Snapshot of SR taken upon completion of the current - * command. Only valid when EVENT_CMD_COMPLETE is pending. - * @data_status: Snapshot of SR taken upon completion of the current - * data transfer. Only valid when EVENT_DATA_COMPLETE or - * EVENT_DATA_ERROR is pending. - * @stop_cmdr: Value to be loaded into CMDR when the stop command is - * to be sent. - * @dir_status: Direction of current transfer. - * @tasklet: Tasklet running the request state machine. - * @card_tasklet: Tasklet handling card detect. - * @pending_events: Bitmask of events flagged by the interrupt handler - * to be processed by the tasklet. - * @completed_events: Bitmask of events which the state machine has - * processed. - * @state: Tasklet state. - * @queue: List of slots waiting for access to the controller. - * @bus_hz: The rate of @mck in Hz. This forms the basis for MMC bus - * rate and timeout calculations. - * @current_speed: Configured rate of the controller. - * @num_slots: Number of slots available. - * @pdev: Platform device associated with the MMC controller. - * @pdata: Platform data associated with the MMC controller. - * @slot: Slots sharing this MMC controller. - * @data_shift: log2 of FIFO item size. - * @push_data: Pointer to FIFO push function. - * @pull_data: Pointer to FIFO pull function. - * @quirks: Set of quirks that apply to specific versions of the IP. - * - * Locking - * ======= - * - * @lock is a softirq-safe spinlock protecting @queue as well as - * @cur_slot, @mrq and @state. These must always be updated - * at the same time while holding @lock. - * - * The @mrq field of struct dw_mci_slot is also protected by @lock, - * and must always be written at the same time as the slot is added to - * @queue. - * - * @pending_events and @completed_events are accessed using atomic bit - * operations, so they don't need any locking. - * - * None of the fields touched by the interrupt handler need any - * locking. However, ordering is important: Before EVENT_DATA_ERROR or - * EVENT_DATA_COMPLETE is set in @pending_events, all data-related - * interrupts must be disabled and @data_status updated with a - * snapshot of SR. Similarly, before EVENT_CMD_COMPLETE is set, the - * CMDRDY interupt must be disabled and @cmd_status updated with a - * snapshot of SR, and before EVENT_XFER_COMPLETE can be set, the - * bytes_xfered field of @data must be written. This is ensured by - * using barriers. 
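The dw_mci documentation being removed spells out an ordering rule: the status snapshot must be published before the corresponding pending-event bit becomes visible, which the driver guarantees with barriers. With C11 atomics the same publish/consume ordering can be sketched as follows; the structure and field names are illustrative only.

#include <stdatomic.h>
#include <stdint.h>

struct host {
	uint32_t	data_status;		/* snapshot of the status register */
	atomic_uint	pending_events;		/* bitmask read by the tasklet */
};

#define EVENT_DATA_COMPLETE	(1u << 0)

/* Interrupt side: write the snapshot, then publish the event with release order. */
static void irq_data_complete(struct host *h, uint32_t status)
{
	h->data_status = status;
	atomic_fetch_or_explicit(&h->pending_events, EVENT_DATA_COMPLETE,
				 memory_order_release);
}

/* Tasklet side: once the bit is seen (acquire), the snapshot is guaranteed valid. */
static int tasklet_poll(struct host *h, uint32_t *status)
{
	unsigned int ev = atomic_load_explicit(&h->pending_events, memory_order_acquire);

	if (!(ev & EVENT_DATA_COMPLETE))
		return 0;
	*status = h->data_status;
	return 1;
}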
- */ -struct dw_mci { - spinlock_t lock; - void __iomem *regs; - - struct scatterlist *sg; - unsigned int pio_offset; - - struct dw_mci_slot *cur_slot; - struct mmc_request *mrq; - struct mmc_command *cmd; - struct mmc_data *data; - - /* DMA interface members*/ - int use_dma; - - dma_addr_t sg_dma; - void *sg_cpu; - struct dw_mci_dma_ops *dma_ops; -#ifdef CONFIG_MMC_DW_IDMAC - unsigned int ring_size; -#else - struct dw_mci_dma_data *dma_data; -#endif - u32 cmd_status; - u32 data_status; - u32 stop_cmdr; - u32 dir_status; - struct tasklet_struct tasklet; - struct tasklet_struct card_tasklet; - unsigned long pending_events; - unsigned long completed_events; - enum dw_mci_state state; - struct list_head queue; - - u32 bus_hz; - u32 current_speed; - u32 num_slots; - struct platform_device *pdev; - struct dw_mci_board *pdata; - struct dw_mci_slot *slot[MAX_MCI_SLOTS]; - - /* FIFO push and pull */ - int data_shift; - void (*push_data)(struct dw_mci *host, void *buf, int cnt); - void (*pull_data)(struct dw_mci *host, void *buf, int cnt); - - /* Workaround flags */ - u32 quirks; -}; - -/* DMA ops for Internal/External DMAC interface */ -struct dw_mci_dma_ops { - /* DMA Ops */ - int (*init)(struct dw_mci *host); - void (*start)(struct dw_mci *host, unsigned int sg_len); - void (*complete)(struct dw_mci *host); - void (*stop)(struct dw_mci *host); - void (*cleanup)(struct dw_mci *host); - void (*exit)(struct dw_mci *host); -}; - -/* IP Quirks/flags. */ -/* No special quirks or flags to cater for */ -#define DW_MCI_QUIRK_NONE 0 -/* DTO fix for command transmission with IDMAC configured */ -#define DW_MCI_QUIRK_IDMAC_DTO 1 -/* delay needed between retries on some 2.11a implementations */ -#define DW_MCI_QUIRK_RETRY_DELAY 2 -/* High Speed Capable - Supports HS cards (upto 50MHz) */ -#define DW_MCI_QUIRK_HIGHSPEED 4 - - -struct dma_pdata; - -struct block_settings { - unsigned short max_segs; /* see blk_queue_max_segments */ - unsigned int max_blk_size; /* maximum size of one mmc block */ - unsigned int max_blk_count; /* maximum number of blocks in one req*/ - unsigned int max_req_size; /* maximum number of bytes in one req*/ - unsigned int max_seg_size; /* see blk_queue_max_segment_size */ -}; - -/* Board platform data */ -struct dw_mci_board { - u32 num_slots; - - u32 quirks; /* Workaround / Quirk flags */ - unsigned int bus_hz; /* Bus speed */ - - /* delay in mS before detecting cards after interrupt */ - u32 detect_delay_ms; - - int (*init)(u32 slot_id, irq_handler_t , void *); - int (*get_ro)(u32 slot_id); - int (*get_cd)(u32 slot_id); - int (*get_ocr)(u32 slot_id); - int (*get_bus_wd)(u32 slot_id); - /* - * Enable power to selected slot and set voltage to desired level. - * Voltage levels are specified using MMC_VDD_xxx defines defined - * in linux/mmc/host.h file. 
- */ - void (*setpower)(u32 slot_id, u32 volt); - void (*exit)(u32 slot_id); - void (*select_slot)(u32 slot_id); - - struct dw_mci_dma_ops *dma_ops; - struct dma_pdata *data; - struct block_settings *blk_settings; -}; - -#endif /* _LINUX_MMC_DW_MMC_H_ */ diff --git a/trunk/include/linux/mmc/host.h b/trunk/include/linux/mmc/host.h index bcb793ec7374..30f6fad99a58 100644 --- a/trunk/include/linux/mmc/host.h +++ b/trunk/include/linux/mmc/host.h @@ -131,9 +131,6 @@ struct mmc_host { unsigned int f_max; unsigned int f_init; u32 ocr_avail; - u32 ocr_avail_sdio; /* SDIO-specific OCR */ - u32 ocr_avail_sd; /* SD-specific OCR */ - u32 ocr_avail_mmc; /* MMC-specific OCR */ struct notifier_block pm_notify; #define MMC_VDD_165_195 0x00000080 /* VDD voltage 1.65 - 1.95 */ @@ -172,20 +169,9 @@ struct mmc_host { #define MMC_CAP_1_2V_DDR (1 << 12) /* can support */ /* DDR mode at 1.2V */ #define MMC_CAP_POWER_OFF_CARD (1 << 13) /* Can power off after boot */ -#define MMC_CAP_BUS_WIDTH_TEST (1 << 14) /* CMD14/CMD19 bus width ok */ mmc_pm_flag_t pm_caps; /* supported pm features */ -#ifdef CONFIG_MMC_CLKGATE - int clk_requests; /* internal reference counter */ - unsigned int clk_delay; /* number of MCI clk hold cycles */ - bool clk_gated; /* clock gated */ - struct work_struct clk_gate_work; /* delayed clock gate */ - unsigned int clk_old; /* old clock value cache */ - spinlock_t clk_lock; /* lock for clk fields */ - struct mutex clk_gate_mutex; /* mutex for clock gating */ -#endif - /* host specific block data */ unsigned int max_seg_size; /* see blk_queue_max_segment_size */ unsigned short max_segs; /* see blk_queue_max_segments */ @@ -321,10 +307,5 @@ static inline int mmc_card_is_removable(struct mmc_host *host) return !(host->caps & MMC_CAP_NONREMOVABLE) && mmc_assume_removable; } -static inline int mmc_card_is_powered_resumed(struct mmc_host *host) -{ - return host->pm_flags & MMC_PM_KEEP_POWER; -} - #endif diff --git a/trunk/include/linux/mmc/mmc.h b/trunk/include/linux/mmc/mmc.h index 612301f85d14..956fbd877692 100644 --- a/trunk/include/linux/mmc/mmc.h +++ b/trunk/include/linux/mmc/mmc.h @@ -40,9 +40,7 @@ #define MMC_READ_DAT_UNTIL_STOP 11 /* adtc [31:0] dadr R1 */ #define MMC_STOP_TRANSMISSION 12 /* ac R1b */ #define MMC_SEND_STATUS 13 /* ac [31:16] RCA R1 */ -#define MMC_BUS_TEST_R 14 /* adtc R1 */ #define MMC_GO_INACTIVE_STATE 15 /* ac [31:16] RCA */ -#define MMC_BUS_TEST_W 19 /* adtc R1 */ #define MMC_SPI_READ_OCR 58 /* spi spi_R3 */ #define MMC_SPI_CRC_ON_OFF 59 /* spi [0:0] flag spi_R1 */ diff --git a/trunk/include/linux/mmc/sdhci.h b/trunk/include/linux/mmc/sdhci.h index 83bd9f76709a..1fdc673f2396 100644 --- a/trunk/include/linux/mmc/sdhci.h +++ b/trunk/include/linux/mmc/sdhci.h @@ -83,8 +83,6 @@ struct sdhci_host { #define SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 (1<<28) /* Controller doesn't have HISPD bit field in HI-SPEED SD card */ #define SDHCI_QUIRK_NO_HISPD_BIT (1<<29) -/* Controller treats ADMA descriptors with length 0000h incorrectly */ -#define SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC (1<<30) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ @@ -141,10 +139,6 @@ struct sdhci_host { unsigned int caps; /* Alternative capabilities */ - unsigned int ocr_avail_sdio; /* OCR bit masks */ - unsigned int ocr_avail_sd; - unsigned int ocr_avail_mmc; - unsigned long private[0] ____cacheline_aligned; }; #endif /* __SDHCI_H */ diff --git a/trunk/include/linux/pci_ids.h b/trunk/include/linux/pci_ids.h index ab47732d81e0..cb845c16ad7d 100644 --- a/trunk/include/linux/pci_ids.h +++ 
b/trunk/include/linux/pci_ids.h
@@ -518,7 +518,6 @@
 #define PCI_DEVICE_ID_AMD_11H_NB_MISC	0x1303
 #define PCI_DEVICE_ID_AMD_11H_NB_LINK	0x1304
 #define PCI_DEVICE_ID_AMD_15H_NB_MISC	0x1603
-#define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
 #define PCI_DEVICE_ID_AMD_SCSI		0x2020
@@ -1651,11 +1650,6 @@
 #define PCI_DEVICE_ID_O2_6836		0x6836
 #define PCI_DEVICE_ID_O2_6812		0x6872
 #define PCI_DEVICE_ID_O2_6933		0x6933
-#define PCI_DEVICE_ID_O2_8120		0x8120
-#define PCI_DEVICE_ID_O2_8220		0x8220
-#define PCI_DEVICE_ID_O2_8221		0x8221
-#define PCI_DEVICE_ID_O2_8320		0x8320
-#define PCI_DEVICE_ID_O2_8321		0x8321

 #define PCI_VENDOR_ID_3DFX		0x121a
 #define PCI_DEVICE_ID_3DFX_VOODOO	0x0001
@@ -2369,8 +2363,6 @@
 #define PCI_DEVICE_ID_JMICRON_JMB38X_SD	0x2381
 #define PCI_DEVICE_ID_JMICRON_JMB38X_MMC	0x2382
 #define PCI_DEVICE_ID_JMICRON_JMB38X_MS	0x2383
-#define PCI_DEVICE_ID_JMICRON_JMB388_SD	0x2391
-#define PCI_DEVICE_ID_JMICRON_JMB388_ESD	0x2392

 #define PCI_VENDOR_ID_KORENIX		0x1982
 #define PCI_DEVICE_ID_KORENIX_JETCARDF0	0x1600
diff --git a/trunk/include/linux/rtc.h b/trunk/include/linux/rtc.h
index 3c995b4d742c..14dbc83ded20 100644
--- a/trunk/include/linux/rtc.h
+++ b/trunk/include/linux/rtc.h
@@ -107,17 +107,12 @@ extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year
 extern int rtc_valid_tm(struct rtc_time *tm);
 extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time);
 extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm);
-ktime_t rtc_tm_to_ktime(struct rtc_time tm);
-struct rtc_time rtc_ktime_to_tm(ktime_t kt);
-

 #include
 #include
 #include
 #include
 #include
-#include
-#include

 extern struct class *rtc_class;
@@ -156,19 +151,7 @@ struct rtc_class_ops {
 };

 #define RTC_DEVICE_NAME_SIZE 20
-typedef struct rtc_task {
-	void (*func)(void *private_data);
-	void *private_data;
-} rtc_task_t;
-
-
-struct rtc_timer {
-	struct rtc_task task;
-	struct timerqueue_node node;
-	ktime_t period;
-	int enabled;
-};
-
+struct rtc_task;

 /* flags */
 #define RTC_DEV_BUSY 0
@@ -196,13 +179,16 @@ struct rtc_device
 	spinlock_t irq_task_lock;
 	int irq_freq;
 	int max_user_freq;
-
-	struct timerqueue_head timerqueue;
-	struct rtc_timer aie_timer;
-	struct rtc_timer uie_rtctimer;
-	struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */
-	int pie_enabled;
-	struct work_struct irqwork;
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	struct work_struct uie_task;
+	struct timer_list uie_timer;
+	/* Those fields are protected by rtc->irq_lock */
+	unsigned int oldsecs;
+	unsigned int uie_irq_active:1;
+	unsigned int stop_uie_polling:1;
+	unsigned int uie_task_active:1;
+	unsigned int uie_timer_active:1;
+#endif
 };

 #define to_rtc_device(d) container_of(d, struct rtc_device, dev)
@@ -238,22 +224,15 @@ extern int rtc_alarm_irq_enable(struct rtc_device *rtc,
 						unsigned int enabled);
 extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc,
 						unsigned int enabled);
-void rtc_aie_update_irq(void *private);
-void rtc_uie_update_irq(void *private);
-enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer);
+typedef struct rtc_task {
+	void (*func)(void *private_data);
+	void *private_data;
+} rtc_task_t;

 int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);

-void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer);
-void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer);
-void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data);
-int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer,
-		    ktime_t expires, ktime_t period);
-int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer);
-void rtc_timer_do_work(struct work_struct *work);
-
 static inline bool is_leap_year(unsigned int year)
 {
 	return (!(year % 4) && (year % 100)) || !(year % 400);
diff --git a/trunk/include/linux/tracepoint.h b/trunk/include/linux/tracepoint.h
index c6814616653b..d3e4f87e95c0 100644
--- a/trunk/include/linux/tracepoint.h
+++ b/trunk/include/linux/tracepoint.h
@@ -32,7 +32,7 @@ struct tracepoint {
 	int state;			/* State. */
 	void (*regfunc)(void);
 	void (*unregfunc)(void);
-	struct tracepoint_func __rcu *funcs;
+	struct tracepoint_func *funcs;
 } __attribute__((aligned(32)));		/*
 					 * Aligned on 32 bytes because it is
 					 * globally visible and gcc happily
@@ -326,7 +326,7 @@ do_trace:						\
 *	memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 *	__entry->next_pid	= next->pid;
 *	__entry->next_prio	= next->prio;
- *	),
+ *	)
 *
 * *
 * * Formatted output of a trace record via TP_printk().
diff --git a/trunk/include/trace/define_trace.h b/trunk/include/trace/define_trace.h
index da39b22636f7..b0b4eb24d592 100644
--- a/trunk/include/trace/define_trace.h
+++ b/trunk/include/trace/define_trace.h
@@ -21,16 +21,6 @@
 #undef CREATE_TRACE_POINTS
 #include

-/*
- * module.h includes tracepoints, and because ftrace.h
- * pulls in module.h:
- *  trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h ->
- *  linux/ftrace.h -> linux/module.h
- * we must include module.h here before we play with any of
- * the TRACE_EVENT() macros, otherwise the tracepoints included
- * by module.h may break the build.
- */
-#include

 #undef TRACE_EVENT
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
diff --git a/trunk/include/trace/events/skb.h b/trunk/include/trace/events/skb.h
index f10293c41b1e..75ce9d500d8e 100644
--- a/trunk/include/trace/events/skb.h
+++ b/trunk/include/trace/events/skb.h
@@ -25,7 +25,9 @@ TRACE_EVENT(kfree_skb,

 	TP_fast_assign(
 		__entry->skbaddr = skb;
-		__entry->protocol = ntohs(skb->protocol);
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
 		__entry->location = location;
 	),

diff --git a/trunk/kernel/Makefile b/trunk/kernel/Makefile
index 5669f71dfdd5..33e0a39cf359 100644
--- a/trunk/kernel/Makefile
+++ b/trunk/kernel/Makefile
@@ -100,7 +100,6 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
-obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
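The rtc.h hunks earlier leave the is_leap_year() helper in place as surrounding context. As a quick reference, the same rule can be exercised in a small standalone program; the test values below are illustrative and not taken from the patch.

#include <assert.h>
#include <stdbool.h>

/* Same expression as the is_leap_year() helper kept in rtc.h: a year is a
 * leap year if it is divisible by 4 but not by 100, or divisible by 400. */
static bool is_leap_year(unsigned int year)
{
	return (!(year % 4) && (year % 100)) || !(year % 400);
}

int main(void)
{
	assert(is_leap_year(2000));	/* divisible by 400 */
	assert(is_leap_year(2012));	/* divisible by 4, not by 100 */
	assert(!is_leap_year(1900));	/* divisible by 100, not by 400 */
	assert(!is_leap_year(2011));
	return 0;
}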
diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c
index f9a45ebcc7b1..89c74861a3da 100644
--- a/trunk/kernel/exit.c
+++ b/trunk/kernel/exit.c
@@ -994,15 +994,6 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
-
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 *
-	 * because of cgroup mode, must be called before cgroup_exit()
-	 */
-	perf_event_exit_task(tsk);
-
 	cgroup_exit(tsk, 1);

 	if (group_dead)
@@ -1016,6 +1007,11 @@ NORET_TYPE void do_exit(long code)
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_event_exit_task(tsk);

 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/trunk/kernel/perf_event.c b/trunk/kernel/perf_event.c
index b782b7a79f00..11847bf1e8cc 100644
--- a/trunk/kernel/perf_event.c
+++ b/trunk/kernel/perf_event.c
@@ -38,12 +38,6 @@

 #include

-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -71,12 +65,6 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;

 static atomic64_t perf_event_id;

-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			      enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
-
 void __weak perf_event_print_debug(void)	{ }

 extern __weak const char *perf_pmu_name(void)
@@ -84,11 +72,6 @@
 {
 	return "pmu";
 }

-static inline u64 perf_clock(void)
-{
-	return local_clock();
-}
-
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -257,6 +240,11 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	put_ctx(ctx);
 }

+static inline u64 perf_clock(void)
+{
+	return local_clock();
+}
+
 /*
  * Update the record of the current time in a context.
  */
@@ -268,12 +256,6 @@ static void update_context_time(struct perf_event_context *ctx)
 	ctx->timestamp = now;
 }

-static u64 perf_event_time(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	return ctx ? ctx->time : 0;
-}
-
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
  */
@@ -287,7 +269,7 @@ static void update_event_times(struct perf_event *event)
 		return;

 	if (ctx->is_active)
-		run_end = perf_event_time(event);
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;

@@ -296,7 +278,7 @@ static void update_event_times(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_INACTIVE)
 		run_end = event->tstamp_stopped;
 	else
-		run_end = perf_event_time(event);
+		run_end = ctx->time;

 	event->total_time_running = run_end - event->tstamp_running;
 }
@@ -552,7 +534,6 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
 	u64 delta;
 	/*
 	 * An event which could not be activated because of
@@ -564,7 +545,7 @@ event_sched_out(struct perf_event *event,
 	    && !event_filter_match(event)) {
 		delta = ctx->time - event->tstamp_stopped;
 		event->tstamp_running += delta;
-		event->tstamp_stopped = tstamp;
+		event->tstamp_stopped = ctx->time;
 	}

 	if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -575,7 +556,7 @@ event_sched_out(struct perf_event *event,
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = tstamp;
+	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
@@ -787,8 +768,6 @@ event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;

@@ -805,9 +784,9 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}

-	event->tstamp_running += tstamp - event->tstamp_stopped;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;

-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	event->shadow_ctx_time = ctx->time - ctx->timestamp;

 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -919,13 +898,11 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = tstamp;
-	event->tstamp_running = tstamp;
-	event->tstamp_stopped = tstamp;
+	event->tstamp_enabled = ctx->time;
+	event->tstamp_running = ctx->time;
+	event->tstamp_stopped = ctx->time;
 }

 /*
@@ -960,7 +937,7 @@ static void __perf_install_in_context(void *info)

 	add_event_to_ctx(event, ctx);

-	if (!event_filter_match(event))
+	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		goto unlock;

 	/*
@@ -1065,13 +1042,14 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);

 	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	event->tstamp_enabled = ctx->time - event->total_time_enabled;
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+		}
 	}
 }
@@ -1104,7 +1082,7 @@ static void __perf_event_enable(void *info)
 		goto unlock;

 	__perf_event_mark_enabled(event, ctx);
-	if (!event_filter_match(event))
+	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		goto unlock;

 	/*
@@ -1215,6 +1193,12 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }

+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
@@ -1451,7 +1435,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (!event_filter_match(event))
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;

 		if (group_can_go_on(event, cpuctx, 1))
@@ -1483,7 +1467,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (!event_filter_match(event))
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;

 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1710,7 +1694,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;

-		if (!event_filter_match(event))
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;

 		hwc = &event->hw;
@@ -3909,7 +3893,7 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;

-	if (!event_filter_match(event))
+	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		return 0;

 	if (event->attr.comm || event->attr.mmap ||
@@ -4046,7 +4030,7 @@ static int perf_event_comm_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;

-	if (!event_filter_match(event))
+	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		return 0;

 	if (event->attr.comm)
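Several of the kernel/perf_event.c hunks above, and the mmap-match hunk that follows, replace event_filter_match() with an open-coded per-CPU test. A reduced userspace sketch of that filter is shown below; the struct and helper names are illustrative stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Reduced stand-in for a perf event: cpu == -1 means "count on any CPU". */
struct demo_event {
	int cpu;
};

/* Mirrors the open-coded test used in the hunks around this point:
 *	if (event->cpu != -1 && event->cpu != smp_processor_id())
 *		continue;
 * i.e. skip the event unless it is unbound or bound to the current CPU. */
static bool event_runs_on_cpu(const struct demo_event *event, int this_cpu)
{
	return event->cpu == -1 || event->cpu == this_cpu;
}

int main(void)
{
	struct demo_event any = { .cpu = -1 };
	struct demo_event cpu2 = { .cpu = 2 };

	printf("any event on cpu 0: %d\n", event_runs_on_cpu(&any, 0));   /* 1 */
	printf("cpu2 event on cpu 0: %d\n", event_runs_on_cpu(&cpu2, 0)); /* 0 */
	printf("cpu2 event on cpu 2: %d\n", event_runs_on_cpu(&cpu2, 2)); /* 1 */
	return 0;
}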
@@ -4194,7 +4178,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;

-	if (!event_filter_match(event))
+	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		return 0;

 	if ((!executable && event->attr.mmap_data) ||
diff --git a/trunk/kernel/trace/Makefile b/trunk/kernel/trace/Makefile
index 761c510a06c5..53f338190b26 100644
--- a/trunk/kernel/trace/Makefile
+++ b/trunk/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
diff --git a/trunk/kernel/trace/trace.c b/trunk/kernel/trace/trace.c
index dc53ecb80589..f8cf959bad45 100644
--- a/trunk/kernel/trace/trace.c
+++ b/trunk/kernel/trace/trace.c
@@ -1313,10 +1313,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)

 	__this_cpu_inc(user_stack_count);

+
+
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		goto out_drop_count;
+		return;
 	entry	= ring_buffer_event_data(event);

 	entry->tgid		= current->tgid;
@@ -1331,8 +1333,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);

- out_drop_count:
 	__this_cpu_dec(user_stack_count);
+ out:
 	preempt_enable();
 }
diff --git a/trunk/lib/dynamic_debug.c b/trunk/lib/dynamic_debug.c
index b335acb43be2..3094318bfea7 100644
--- a/trunk/lib/dynamic_debug.c
+++ b/trunk/lib/dynamic_debug.c
@@ -141,10 +141,11 @@ static void ddebug_change(const struct ddebug_query *query,
 			else if (!dp->flags)
 				dt->num_enabled++;
 			dp->flags = newflags;
-			if (newflags)
-				dp->enabled = 1;
-			else
-				dp->enabled = 0;
+			if (newflags) {
+				jump_label_enable(&dp->enabled);
+			} else {
+				jump_label_disable(&dp->enabled);
+			}
 			if (verbose)
 				printk(KERN_INFO
 					"ddebug: changed %s:%d [%s]%s %s\n",
diff --git a/trunk/tools/perf/Makefile b/trunk/tools/perf/Makefile
index 2b5387d53ba5..1b9b13ee2a72 100644
--- a/trunk/tools/perf/Makefile
+++ b/trunk/tools/perf/Makefile
@@ -227,7 +227,7 @@ ifndef PERF_DEBUG
   CFLAGS_OPTIMIZE = -O6
 endif

-CFLAGS = -fno-omit-frame-pointer -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
+CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 EXTLIBS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/trunk/tools/perf/builtin-record.c b/trunk/tools/perf/builtin-record.c
index 7069bd3e90b3..7bc049035484 100644
--- a/trunk/tools/perf/builtin-record.c
+++ b/trunk/tools/perf/builtin-record.c
@@ -331,9 +331,6 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 		else if (err == ENODEV && cpu_list) {
 			die("No such device - did you specify"
 			    " an out-of-range profile CPU?\n");
-		} else if (err == ENOENT) {
-			die("%s event is not supported. ",
-			    event_name(evsel));
 		} else if (err == EINVAL && sample_id_all_avail) {
 			/*
 			 * Old kernel, no attr->sample_id_type_all field
diff --git a/trunk/tools/perf/builtin-sched.c b/trunk/tools/perf/builtin-sched.c
index abd4b8497bc4..7a4ebeb8b016 100644
--- a/trunk/tools/perf/builtin-sched.c
+++ b/trunk/tools/perf/builtin-sched.c
@@ -489,8 +489,7 @@ static void create_tasks(void)

 	err = pthread_attr_init(&attr);
 	BUG_ON(err);
-	err = pthread_attr_setstacksize(&attr,
-			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
+	err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
 	BUG_ON(err);
 	err = pthread_mutex_lock(&start_work_mutex);
 	BUG_ON(err);
@@ -1862,7 +1861,7 @@ static int __cmd_record(int argc, const char **argv)
 	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));

-	if (rec_argv == NULL)
+	if (rec_argv)
 		return -ENOMEM;

 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
diff --git a/trunk/tools/perf/builtin-stat.c b/trunk/tools/perf/builtin-stat.c
index c385a63ebfd1..02b2d8013a61 100644
--- a/trunk/tools/perf/builtin-stat.c
+++ b/trunk/tools/perf/builtin-stat.c
@@ -316,8 +316,6 @@ static int run_perf_stat(int argc __used, const char **argv)
 			      "\t Consider tweaking"
 			      " /proc/sys/kernel/perf_event_paranoid or running as root.",
 			      system_wide ? "system-wide " : "");
-		} else if (errno == ENOENT) {
-			error("%s event is not supported. ", event_name(counter));
 		} else {
 			error("open_counter returned with %d (%s). "
 			      "/bin/dmesg may provide additional information.\n",
@@ -685,7 +683,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		nr_counters = ARRAY_SIZE(default_attrs);

 		for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
-			pos = perf_evsel__new(&default_attrs[c],
+			pos = perf_evsel__new(default_attrs[c].type,
+					      default_attrs[c].config,
 					      nr_counters);
 			if (pos == NULL)
 				goto out;
diff --git a/trunk/tools/perf/builtin-test.c b/trunk/tools/perf/builtin-test.c
index ed5696198d3d..1c984342a579 100644
--- a/trunk/tools/perf/builtin-test.c
+++ b/trunk/tools/perf/builtin-test.c
@@ -234,7 +234,6 @@ static int test__vmlinux_matches_kallsyms(void)
 	return err;
 }

-#include "util/cpumap.h"
 #include "util/evsel.h"
 #include

@@ -265,7 +264,6 @@ static int test__open_syscall_event(void)
 	int err = -1, fd;
 	struct thread_map *threads;
 	struct perf_evsel *evsel;
-	struct perf_event_attr attr;
 	unsigned int nr_open_calls = 111, i;
 	int id = trace_event__id("sys_enter_open");

@@ -280,10 +278,7 @@ static int test__open_syscall_event(void)
 		return -1;
 	}

-	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_TRACEPOINT;
-	attr.config = id;
-	evsel = perf_evsel__new(&attr, 0);
+	evsel = perf_evsel__new(PERF_TYPE_TRACEPOINT, id, 0);
 	if (evsel == NULL) {
 		pr_debug("perf_evsel__new\n");
 		goto out_thread_map_delete;
@@ -322,111 +317,6 @@ static int test__open_syscall_event(void)
 	return err;
 }

-#include
-
-static int test__open_syscall_event_on_all_cpus(void)
-{
-	int err = -1, fd, cpu;
-	struct thread_map *threads;
-	struct cpu_map *cpus;
-	struct perf_evsel *evsel;
-	struct perf_event_attr attr;
-	unsigned int nr_open_calls = 111, i;
-	cpu_set_t *cpu_set;
-	size_t cpu_set_size;
-	int id = trace_event__id("sys_enter_open");
-
-	if (id < 0) {
-		pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
-		return -1;
-	}
-
-	threads = thread_map__new(-1, getpid());
-	if (threads == NULL) {
-		pr_debug("thread_map__new\n");
-		return -1;
-	}
-
-	cpus = cpu_map__new(NULL);
-	if (threads == NULL) {
-		pr_debug("thread_map__new\n");
-		return -1;
-	}
-
-	cpu_set = CPU_ALLOC(cpus->nr);
-
-	if (cpu_set == NULL)
-		goto out_thread_map_delete;
-
-	cpu_set_size = CPU_ALLOC_SIZE(cpus->nr);
-	CPU_ZERO_S(cpu_set_size, cpu_set);
-
-	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_TRACEPOINT;
-	attr.config = id;
-	evsel = perf_evsel__new(&attr, 0);
-	if (evsel == NULL) {
-		pr_debug("perf_evsel__new\n");
-		goto out_cpu_free;
-	}
-
-	if (perf_evsel__open(evsel, cpus, threads) < 0) {
-		pr_debug("failed to open counter: %s, "
-			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-			 strerror(errno));
-		goto out_evsel_delete;
-	}
-
-	for (cpu = 0; cpu < cpus->nr; ++cpu) {
-		unsigned int ncalls = nr_open_calls + cpu;
-
-		CPU_SET(cpu, cpu_set);
-		sched_setaffinity(0, cpu_set_size, cpu_set);
-		for (i = 0; i < ncalls; ++i) {
-			fd = open("/etc/passwd", O_RDONLY);
-			close(fd);
-		}
-		CPU_CLR(cpu, cpu_set);
-	}
-
-	/*
-	 * Here we need to explicitely preallocate the counts, as if
-	 * we use the auto allocation it will allocate just for 1 cpu,
-	 * as we start by cpu 0.
-	 */
-	if (perf_evsel__alloc_counts(evsel, cpus->nr) < 0) {
-		pr_debug("perf_evsel__alloc_counts(ncpus=%d)\n", cpus->nr);
-		goto out_close_fd;
-	}
-
-	for (cpu = 0; cpu < cpus->nr; ++cpu) {
-		unsigned int expected;
-
-		if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
-			pr_debug("perf_evsel__open_read_on_cpu\n");
-			goto out_close_fd;
-		}
-
-		expected = nr_open_calls + cpu;
-		if (evsel->counts->cpu[cpu].val != expected) {
-			pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %Ld\n",
-				 expected, cpu, evsel->counts->cpu[cpu].val);
-			goto out_close_fd;
-		}
-	}
-
-	err = 0;
-out_close_fd:
-	perf_evsel__close_fd(evsel, 1, threads->nr);
-out_evsel_delete:
-	perf_evsel__delete(evsel);
-out_cpu_free:
-	CPU_FREE(cpu_set);
-out_thread_map_delete:
-	thread_map__delete(threads);
-	return err;
-}
-
 static struct test {
 	const char *desc;
 	int (*func)(void);
@@ -439,10 +329,6 @@ static struct test {
 		.desc = "detect open syscall event",
 		.func = test__open_syscall_event,
 	},
-	{
-		.desc = "detect open syscall event on all cpus",
-		.func = test__open_syscall_event_on_all_cpus,
-	},
 	{
 		.func = NULL,
 	},
diff --git a/trunk/tools/perf/builtin-top.c b/trunk/tools/perf/builtin-top.c
index 6ce4042421bd..1e67ab9c7ebc 100644
--- a/trunk/tools/perf/builtin-top.c
+++ b/trunk/tools/perf/builtin-top.c
@@ -1247,8 +1247,6 @@ static void start_counter(int i, struct perf_evsel *evsel)
 			die("Permission error - are you root?\n"
 			    "\t Consider tweaking"
 			    " /proc/sys/kernel/perf_event_paranoid.\n");
-		if (err == ENOENT)
-			die("%s event is not supported. ", event_name(evsel));
 		/*
 		 * If it's cycles then fall back to hrtimer
 		 * based cpu-clock-tick sw counter, which
diff --git a/trunk/tools/perf/util/evsel.c b/trunk/tools/perf/util/evsel.c
index f5cfed60af98..c95267e63c5b 100644
--- a/trunk/tools/perf/util/evsel.c
+++ b/trunk/tools/perf/util/evsel.c
@@ -6,13 +6,14 @@

 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))

-struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx)
 {
 	struct perf_evsel *evsel = zalloc(sizeof(*evsel));

 	if (evsel != NULL) {
 		evsel->idx	   = idx;
-		evsel->attr	   = *attr;
+		evsel->attr.type   = type;
+		evsel->attr.config = config;
 		INIT_LIST_HEAD(&evsel->node);
 	}

@@ -127,75 +128,59 @@ int __perf_evsel__read(struct perf_evsel *evsel,
 	return 0;
 }

-static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-			      struct thread_map *threads)
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
 {
-	int cpu, thread;
+	int cpu;

-	if (evsel->fd == NULL &&
-	    perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
+	if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, 1) < 0)
 		return -1;

 	for (cpu = 0; cpu < cpus->nr; cpu++) {
-		for (thread = 0; thread < threads->nr; thread++) {
-			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
-								     threads->map[thread],
-								     cpus->map[cpu], -1, 0);
-			if (FD(evsel, cpu, thread) < 0)
-				goto out_close;
-		}
+		FD(evsel, cpu, 0) = sys_perf_event_open(&evsel->attr, -1,
+							cpus->map[cpu], -1, 0);
+		if (FD(evsel, cpu, 0) < 0)
+			goto out_close;
 	}

 	return 0;

 out_close:
-	do {
-		while (--thread >= 0) {
-			close(FD(evsel, cpu, thread));
-			FD(evsel, cpu, thread) = -1;
-		}
-		thread = threads->nr;
-	} while (--cpu >= 0);
+	while (--cpu >= 0) {
+		close(FD(evsel, cpu, 0));
+		FD(evsel, cpu, 0) = -1;
+	}
 	return -1;
 }

-static struct {
-	struct cpu_map map;
-	int cpus[1];
-} empty_cpu_map = {
-	.map.nr = 1,
-	.cpus = { -1, },
-};
-
-static struct {
-	struct thread_map map;
-	int threads[1];
-} empty_thread_map = {
-	.map.nr = 1,
-	.threads = { -1, },
-};
-
-int perf_evsel__open(struct perf_evsel *evsel,
-		     struct cpu_map *cpus, struct thread_map *threads)
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
 {
+	int thread;
+
+	if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, 1, threads->nr))
+		return -1;

-	if (cpus == NULL) {
-		/* Work around old compiler warnings about strict aliasing */
-		cpus = &empty_cpu_map.map;
+	for (thread = 0; thread < threads->nr; thread++) {
+		FD(evsel, 0, thread) = sys_perf_event_open(&evsel->attr,
+							   threads->map[thread], -1, -1, 0);
+		if (FD(evsel, 0, thread) < 0)
+			goto out_close;
 	}

-	if (threads == NULL)
-		threads = &empty_thread_map.map;
+	return 0;

-	return __perf_evsel__open(evsel, cpus, threads);
+out_close:
+	while (--thread >= 0) {
+		close(FD(evsel, 0, thread));
+		FD(evsel, 0, thread) = -1;
+	}
+	return -1;
 }

-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+int perf_evsel__open(struct perf_evsel *evsel,
+		     struct cpu_map *cpus, struct thread_map *threads)
 {
-	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map);
-}
+	if (threads == NULL)
+		return perf_evsel__open_per_cpu(evsel, cpus);

-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
-{
-	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads);
+	return perf_evsel__open_per_thread(evsel, threads);
 }
diff --git a/trunk/tools/perf/util/evsel.h b/trunk/tools/perf/util/evsel.h
index b2d755fe88a5..a0ccd69c3fc2 100644
--- a/trunk/tools/perf/util/evsel.h
+++ b/trunk/tools/perf/util/evsel.h
@@ -37,7 +37,7 @@ struct perf_evsel {
 struct cpu_map;
 struct thread_map;

-struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx);
 void perf_evsel__delete(struct perf_evsel *evsel);

 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
diff --git a/trunk/tools/perf/util/parse-events.c b/trunk/tools/perf/util/parse-events.c
index 5cb6f4bde905..649083f27e08 100644
--- a/trunk/tools/perf/util/parse-events.c
+++ b/trunk/tools/perf/util/parse-events.c
@@ -490,31 +490,6 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
 	return EVT_HANDLED_ALL;
 }

-static int store_event_type(const char *orgname)
-{
-	char filename[PATH_MAX], *c;
-	FILE *file;
-	int id, n;
-
-	sprintf(filename, "%s/", debugfs_path);
-	strncat(filename, orgname, strlen(orgname));
-	strcat(filename, "/id");
-
-	c = strchr(filename, ':');
-	if (c)
-		*c = '/';
-
-	file = fopen(filename, "r");
-	if (!file)
-		return 0;
-	n = fscanf(file, "%i", &id);
-	fclose(file);
-	if (n < 1) {
-		pr_err("cannot store event ID\n");
-		return -EINVAL;
-	}
-	return perf_header__push_event(id, orgname);
-}

 static enum event_result
 parse_tracepoint_event(const char **strp, struct perf_event_attr *attr)
@@ -558,13 +533,9 @@ static enum event_result parse_tracepoint_event(const char **strp,
 		*strp += strlen(sys_name) + evt_length;
 		return parse_multiple_tracepoint_event(sys_name, evt_name,
 						       flags);
-	} else {
-		if (store_event_type(evt_name) < 0)
-			return EVT_FAILED;
-
+	} else
 		return parse_single_tracepoint_event(sys_name, evt_name,
 						     evt_length, attr, strp);
-	}
 }

@@ -807,11 +778,41 @@ parse_event_symbols(const char **str, struct perf_event_attr *attr)
 	return ret;
 }

+static int store_event_type(const char *orgname)
+{
+	char filename[PATH_MAX], *c;
+	FILE *file;
+	int id, n;
+
+	sprintf(filename, "%s/", debugfs_path);
+	strncat(filename, orgname, strlen(orgname));
+	strcat(filename, "/id");
+
+	c = strchr(filename, ':');
+	if (c)
+		*c = '/';
+
+	file = fopen(filename, "r");
+	if (!file)
+		return 0;
+	n = fscanf(file, "%i", &id);
+	fclose(file);
+	if (n < 1) {
+		pr_err("cannot store event ID\n");
+		return -EINVAL;
+	}
+	return perf_header__push_event(id, orgname);
+}
+
 int parse_events(const struct option *opt __used, const char *str, int unset __used)
 {
 	struct perf_event_attr attr;
 	enum event_result ret;

+	if (strchr(str, ':'))
+		if (store_event_type(str) < 0)
+			return -1;
+
 	for (;;) {
 		memset(&attr, 0, sizeof(attr));
 		ret = parse_event_symbols(&str, &attr);
@@ -823,7 +824,7 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
 		if (ret != EVT_HANDLED_ALL) {
 			struct perf_evsel *evsel;

-			evsel = perf_evsel__new(&attr,
+			evsel = perf_evsel__new(attr.type, attr.config,
 						nr_counters);
 			if (evsel == NULL)
 				return -1;
@@ -1013,15 +1014,8 @@ void print_events(void)

 int perf_evsel_list__create_default(void)
 {
-	struct perf_evsel *evsel;
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_HARDWARE;
-	attr.config = PERF_COUNT_HW_CPU_CYCLES;
-
-	evsel = perf_evsel__new(&attr, 0);
-
+	struct perf_evsel *evsel = perf_evsel__new(PERF_TYPE_HARDWARE,
+						   PERF_COUNT_HW_CPU_CYCLES, 0);
 	if (evsel == NULL)
 		return -ENOMEM;
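The evsel.h and parse-events.c hunks above move perf_evsel__new() from taking a full perf_event_attr to taking just a (type, config) pair that is copied into the embedded attr. A reduced, self-contained sketch of that constructor shape is shown below; the struct and names are illustrative stand-ins, not the perf sources, and the 0/0 arguments merely stand in for a hardware cycles event.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Cut-down stand-ins for perf_event_attr / perf_evsel. */
struct demo_attr {
	uint32_t type;
	uint64_t config;
};

struct demo_evsel {
	struct demo_attr attr;
	int idx;
};

/* Constructor in the style restored above: callers pass type and config
 * directly instead of preparing a full attr structure first. */
static struct demo_evsel *demo_evsel__new(uint32_t type, uint64_t config, int idx)
{
	struct demo_evsel *evsel = calloc(1, sizeof(*evsel));

	if (evsel != NULL) {
		evsel->idx = idx;
		evsel->attr.type = type;
		evsel->attr.config = config;
	}
	return evsel;
}

int main(void)
{
	struct demo_evsel *evsel = demo_evsel__new(0, 0, 0);

	if (evsel == NULL)
		return 1;
	printf("type=%u config=%llu idx=%d\n", evsel->attr.type,
	       (unsigned long long)evsel->attr.config, evsel->idx);
	free(evsel);
	return 0;
}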
diff --git a/trunk/tools/perf/util/session.c b/trunk/tools/perf/util/session.c
index 313dac2d94ce..6fb4694d05fa 100644
--- a/trunk/tools/perf/util/session.c
+++ b/trunk/tools/perf/util/session.c
@@ -1007,7 +1007,7 @@ int __perf_session__process_events(struct perf_session *session,
 	if (size == 0)
 		size = 8;

-	if (head + event->header.size > mmap_size) {
+	if (head + event->header.size >= mmap_size) {
 		if (mmaps[map_idx]) {
 			munmap(mmaps[map_idx], mmap_size);
 			mmaps[map_idx] = NULL;
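The session.c hunk above changes the sliding-window test from '>' to '>=' when deciding whether the next record still fits in the currently mmap'd window of perf.data. A reduced sketch of that boundary check, with hypothetical names (demo_reader, need_remap), is shown below; with '>=', a record that would end exactly at the window boundary also forces a remap.

#include <stdint.h>
#include <stdio.h>

/* Minimal model of a sliding read window over a data file. */
struct demo_reader {
	uint64_t window_size;	/* bytes currently mapped */
	uint64_t head;		/* offset of the next record inside the window */
};

/* Returns 1 when the caller must unmap and remap further into the file
 * before reading the next record. */
static int need_remap(const struct demo_reader *r, uint16_t record_size)
{
	return r->head + record_size >= r->window_size;
}

int main(void)
{
	struct demo_reader r = { .window_size = 4096, .head = 4088 };

	printf("8-byte record at 4088: remap=%d\n", need_remap(&r, 8));	/* 1 */
	printf("4-byte record at 4088: remap=%d\n", need_remap(&r, 4));	/* 0 */
	return 0;
}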