From 7398bb460b7a07b923a08ba922345caef2dff22b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Jun 2009 19:14:22 -0700 Subject: [PATCH] --- yaml --- r: 147897 b: refs/heads/master c: 8c5dd8f43367f4f266dd616f11658005bc2d20ef h: refs/heads/master i: 147895: 74d9b71b844dbf483b2531733ee2bbfce64938eb v: v3 --- [refs] | 2 +- trunk/MAINTAINERS | 8 +- trunk/arch/alpha/kernel/osf_sys.c | 3 + trunk/arch/m68k/include/asm/m520xsim.h | 9 - trunk/arch/m68k/include/asm/m523xsim.h | 9 - trunk/arch/m68k/include/asm/m527xsim.h | 9 - trunk/arch/m68k/include/asm/m528xsim.h | 8 - trunk/arch/m68k/include/asm/m532xsim.h | 12 - trunk/arch/m68k/include/asm/processor_no.h | 8 +- trunk/arch/m68k/include/asm/swab.h | 2 +- trunk/arch/m68k/include/asm/system_no.h | 107 + trunk/arch/m68knommu/kernel/entry.S | 1 + trunk/arch/m68knommu/kernel/setup.c | 16 +- trunk/arch/m68knommu/mm/init.c | 4 +- trunk/arch/m68knommu/platform/5206/config.c | 18 +- trunk/arch/m68knommu/platform/5206e/config.c | 18 +- trunk/arch/m68knommu/platform/520x/config.c | 15 +- trunk/arch/m68knommu/platform/523x/config.c | 14 +- trunk/arch/m68knommu/platform/5249/config.c | 18 +- trunk/arch/m68knommu/platform/5272/config.c | 18 +- trunk/arch/m68knommu/platform/527x/config.c | 15 +- trunk/arch/m68knommu/platform/528x/config.c | 13 +- trunk/arch/m68knommu/platform/5307/config.c | 16 +- trunk/arch/m68knommu/platform/532x/config.c | 12 +- trunk/arch/m68knommu/platform/5407/config.c | 16 +- .../m68knommu/platform/coldfire/vectors.c | 7 + trunk/arch/powerpc/include/asm/hw_irq.h | 3 + trunk/arch/powerpc/kernel/irq.c | 1 - trunk/arch/x86/kernel/apic/io_apic.c | 4 +- trunk/arch/x86/kernel/setup.c | 15 +- trunk/block/blk-core.c | 21 +- trunk/block/blk-settings.c | 6 +- trunk/drivers/ieee1394/dv1394.c | 5 +- trunk/drivers/ieee1394/ieee1394_core.h | 6 +- trunk/drivers/usb/core/inode.c | 5 - trunk/fs/adfs/adfs.h | 4 +- trunk/fs/adfs/dir.c | 10 +- trunk/fs/adfs/dir_f.c | 17 - trunk/fs/adfs/dir_fplus.c | 17 - trunk/fs/adfs/file.c | 2 +- trunk/fs/adfs/inode.c | 4 +- trunk/fs/adfs/map.c | 2 +- trunk/fs/adfs/super.c | 4 - trunk/fs/affs/affs.h | 1 - trunk/fs/affs/dir.c | 2 +- trunk/fs/affs/file.c | 14 +- trunk/fs/affs/super.c | 54 +- trunk/fs/afs/mntpt.c | 2 +- trunk/fs/afs/super.c | 4 - trunk/fs/autofs/dirhash.c | 5 +- trunk/fs/autofs4/autofs_i.h | 6 +- trunk/fs/autofs4/dev-ioctl.c | 195 +- trunk/fs/autofs4/expire.c | 15 +- trunk/fs/autofs4/root.c | 7 +- trunk/fs/befs/linuxvfs.c | 5 +- trunk/fs/bfs/dir.c | 8 +- trunk/fs/bfs/inode.c | 52 +- trunk/fs/block_dev.c | 19 +- trunk/fs/btrfs/Makefile | 4 +- trunk/fs/btrfs/acl.c | 5 + trunk/fs/btrfs/async-thread.c | 2 +- trunk/fs/btrfs/btrfs_inode.h | 4 +- trunk/fs/btrfs/compression.c | 6 +- trunk/fs/btrfs/crc32c.h | 29 + trunk/fs/btrfs/ctree.c | 698 ++-- trunk/fs/btrfs/ctree.h | 330 +- trunk/fs/btrfs/delayed-ref.c | 509 +-- trunk/fs/btrfs/delayed-ref.h | 85 +- trunk/fs/btrfs/disk-io.c | 164 +- trunk/fs/btrfs/export.c | 4 +- trunk/fs/btrfs/extent-tree.c | 2672 ++++-------- trunk/fs/btrfs/extent_io.c | 18 +- trunk/fs/btrfs/file.c | 78 +- trunk/fs/btrfs/free-space-cache.c | 10 +- trunk/fs/btrfs/free-space-cache.h | 1 - trunk/fs/btrfs/hash.h | 4 +- trunk/fs/btrfs/inode.c | 166 +- trunk/fs/btrfs/ioctl.c | 197 +- trunk/fs/btrfs/print-tree.c | 155 +- trunk/fs/btrfs/relocation.c | 3711 ----------------- trunk/fs/btrfs/root-tree.c | 17 +- trunk/fs/btrfs/super.c | 64 +- trunk/fs/btrfs/transaction.c | 410 +- trunk/fs/btrfs/transaction.h | 12 +- trunk/fs/btrfs/tree-log.c | 103 +- trunk/fs/btrfs/volumes.c | 69 +- 
trunk/fs/btrfs/volumes.h | 12 +- trunk/fs/cachefiles/interface.c | 4 +- trunk/fs/char_dev.c | 14 +- trunk/fs/cifs/cifs_dfs_ref.c | 2 +- trunk/fs/cifs/cifsfs.c | 6 +- trunk/fs/compat.c | 2 + trunk/fs/dcache.c | 7 +- trunk/fs/ecryptfs/super.c | 5 - trunk/fs/exofs/super.c | 25 +- trunk/fs/ext2/Makefile | 2 +- trunk/fs/ext2/dir.c | 2 +- trunk/fs/ext2/ext2.h | 3 + trunk/fs/ext2/file.c | 4 +- trunk/fs/ext2/fsync.c | 50 + trunk/fs/ext2/inode.c | 11 +- trunk/fs/ext2/super.c | 60 +- trunk/fs/ext3/balloc.c | 3 +- trunk/fs/ext3/ialloc.c | 3 +- trunk/fs/ext3/inode.c | 1 + trunk/fs/ext3/resize.c | 2 + trunk/fs/ext3/super.c | 34 +- trunk/fs/ext3/xattr.c | 1 + trunk/fs/ext4/super.c | 16 +- trunk/fs/fat/dir.c | 16 +- trunk/fs/fat/fat.h | 6 - trunk/fs/fat/fatent.c | 13 +- trunk/fs/fat/file.c | 14 +- trunk/fs/fat/inode.c | 31 +- trunk/fs/fat/namei_msdos.c | 4 +- trunk/fs/fat/namei_vfat.c | 4 +- trunk/fs/file_table.c | 40 +- trunk/fs/freevxfs/vxfs_super.c | 4 - trunk/fs/fs-writeback.c | 92 +- trunk/fs/gfs2/log.c | 2 + trunk/fs/gfs2/super.c | 15 +- trunk/fs/hfs/super.c | 23 +- trunk/fs/hfsplus/super.c | 25 +- trunk/fs/hpfs/super.c | 12 - trunk/fs/inode.c | 12 +- trunk/fs/internal.h | 17 - trunk/fs/isofs/inode.c | 5 - trunk/fs/jffs2/fs.c | 18 +- trunk/fs/jffs2/os-linux.h | 1 + trunk/fs/jffs2/super.c | 26 - trunk/fs/jfs/super.c | 27 +- trunk/fs/libfs.c | 25 - trunk/fs/minix/dir.c | 2 +- trunk/fs/minix/file.c | 20 +- trunk/fs/minix/inode.c | 37 +- trunk/fs/minix/minix.h | 2 + trunk/fs/namei.c | 129 +- trunk/fs/namespace.c | 327 +- trunk/fs/ncpfs/inode.c | 4 - trunk/fs/nfs/namespace.c | 2 +- trunk/fs/nfs/super.c | 2 - trunk/fs/nfsd/export.c | 78 +- trunk/fs/nfsd/vfs.c | 54 +- trunk/fs/nilfs2/cpfile.c | 6 +- trunk/fs/nilfs2/sb.h | 1 - trunk/fs/nilfs2/super.c | 256 +- trunk/fs/nilfs2/the_nilfs.c | 113 +- trunk/fs/nilfs2/the_nilfs.h | 23 +- trunk/fs/notify/Kconfig | 13 - trunk/fs/notify/Makefile | 2 - trunk/fs/notify/dnotify/Kconfig | 1 - trunk/fs/notify/dnotify/dnotify.c | 464 +-- trunk/fs/notify/fsnotify.c | 186 - trunk/fs/notify/fsnotify.h | 34 - trunk/fs/notify/group.c | 254 -- trunk/fs/notify/inode_mark.c | 426 -- trunk/fs/notify/inotify/Kconfig | 20 +- trunk/fs/notify/inotify/Makefile | 2 +- trunk/fs/notify/inotify/inotify.c | 20 - trunk/fs/notify/inotify/inotify.h | 21 - trunk/fs/notify/inotify/inotify_fsnotify.c | 138 - trunk/fs/notify/inotify/inotify_user.c | 837 ++-- trunk/fs/notify/notification.c | 411 -- trunk/fs/ntfs/super.c | 54 +- trunk/fs/ocfs2/super.c | 22 +- trunk/fs/omfs/file.c | 17 +- trunk/fs/open.c | 4 +- trunk/fs/proc/internal.h | 25 - trunk/fs/proc/proc_devtree.c | 1 - trunk/fs/qnx4/Makefile | 2 +- trunk/fs/qnx4/bitmap.c | 7 +- trunk/fs/qnx4/dir.c | 9 +- trunk/fs/qnx4/file.c | 5 +- trunk/fs/qnx4/fsync.c | 169 + trunk/fs/qnx4/inode.c | 58 +- trunk/fs/qnx4/namei.c | 13 +- trunk/fs/qnx4/qnx4.h | 57 - trunk/fs/qnx4/truncate.c | 6 +- trunk/fs/quota/quota.c | 25 +- trunk/fs/reiserfs/dir.c | 10 +- trunk/fs/reiserfs/super.c | 33 +- trunk/fs/reiserfs/xattr.c | 3 +- trunk/fs/smbfs/inode.c | 4 - trunk/fs/squashfs/super.c | 4 - trunk/fs/super.c | 192 +- trunk/fs/sync.c | 117 +- trunk/fs/sysv/dir.c | 2 +- trunk/fs/sysv/file.c | 17 +- trunk/fs/sysv/inode.c | 75 +- trunk/fs/sysv/sysv.h | 1 + trunk/fs/ubifs/super.c | 17 +- trunk/fs/udf/Makefile | 2 +- trunk/fs/udf/dir.c | 2 +- trunk/fs/udf/file.c | 2 +- trunk/fs/udf/fsync.c | 52 + trunk/fs/udf/super.c | 11 +- trunk/fs/udf/udfdecl.h | 3 + trunk/fs/ufs/dir.c | 2 +- trunk/fs/ufs/file.c | 23 +- trunk/fs/ufs/super.c | 65 +- trunk/fs/ufs/ufs.h | 1 + 
trunk/fs/xattr.c | 4 +- trunk/fs/xfs/linux-2.6/xfs_super.c | 12 + trunk/fs/xfs/xfs_trans.c | 2 + trunk/include/linux/Kbuild | 2 +- trunk/include/linux/cdev.h | 2 - trunk/include/linux/cramfs_fs.h | 3 +- trunk/include/linux/dcache.h | 11 +- trunk/include/linux/dnotify.h | 29 +- trunk/include/linux/fs.h | 27 +- trunk/include/linux/fsnotify.h | 199 +- trunk/include/linux/fsnotify_backend.h | 387 -- trunk/include/linux/magic.h | 2 - trunk/include/linux/mount.h | 25 +- trunk/include/linux/namei.h | 5 +- trunk/include/linux/nfsd/export.h | 6 +- trunk/include/linux/proc_fs.h | 24 + trunk/include/linux/qnx4_fs.h | 61 + trunk/include/linux/quotaops.h | 20 +- trunk/include/linux/reiserfs_fs_sb.h | 2 - trunk/include/linux/writeback.h | 1 + trunk/init/Kconfig | 3 +- trunk/kernel/audit_tree.c | 6 +- trunk/kernel/cgroup.c | 3 - trunk/virt/kvm/kvm_main.c | 1 + 225 files changed, 4890 insertions(+), 12065 deletions(-) create mode 100644 trunk/fs/btrfs/crc32c.h delete mode 100644 trunk/fs/btrfs/relocation.c create mode 100644 trunk/fs/ext2/fsync.c delete mode 100644 trunk/fs/notify/fsnotify.c delete mode 100644 trunk/fs/notify/fsnotify.h delete mode 100644 trunk/fs/notify/group.c delete mode 100644 trunk/fs/notify/inode_mark.c delete mode 100644 trunk/fs/notify/inotify/inotify.h delete mode 100644 trunk/fs/notify/inotify/inotify_fsnotify.c delete mode 100644 trunk/fs/notify/notification.c create mode 100644 trunk/fs/qnx4/fsync.c delete mode 100644 trunk/fs/qnx4/qnx4.h create mode 100644 trunk/fs/udf/fsync.c delete mode 100644 trunk/include/linux/fsnotify_backend.h diff --git a/[refs] b/[refs] index 3ec96a39b723..ad5cbea34100 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 8ebf975608aaebd7feb33d77f07ba21a6380e086 +refs/heads/master: 8c5dd8f43367f4f266dd616f11658005bc2d20ef diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index c944d618dc83..1a0084e22cf3 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -1802,10 +1802,10 @@ F: drivers/char/epca* F: drivers/char/digi* DIRECTORY NOTIFICATION (DNOTIFY) -P: Eric Paris -M: eparis@parisplace.org +P: Stephen Rothwell +M: sfr@canb.auug.org.au L: linux-kernel@vger.kernel.org -S: Maintained +S: Supported F: Documentation/filesystems/dnotify.txt F: fs/notify/dnotify/ F: include/linux/dnotify.h @@ -2858,8 +2858,6 @@ P: John McCutchan M: john@johnmccutchan.com P: Robert Love M: rlove@rlove.org -P: Eric Paris -M: eparis@parisplace.org L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/inotify.txt diff --git a/trunk/arch/alpha/kernel/osf_sys.c b/trunk/arch/alpha/kernel/osf_sys.c index 9a3334ae282e..42ee05981e71 100644 --- a/trunk/arch/alpha/kernel/osf_sys.c +++ b/trunk/arch/alpha/kernel/osf_sys.c @@ -371,6 +371,8 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, int retval = -EINVAL; char *name; + lock_kernel(); + name = getname(path); retval = PTR_ERR(name); if (IS_ERR(name)) @@ -390,6 +392,7 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, } putname(name); out: + unlock_kernel(); return retval; } diff --git a/trunk/arch/m68k/include/asm/m520xsim.h b/trunk/arch/m68k/include/asm/m520xsim.h index 83bbcfd6e8f2..49d016e6391a 100644 --- a/trunk/arch/m68k/include/asm/m520xsim.h +++ b/trunk/arch/m68k/include/asm/m520xsim.h @@ -59,14 +59,5 @@ #define MCFPIT_IMR MCFINTC_IMRL #define MCFPIT_IMR_IBIT (1 << MCFINT_PIT1) -/* - * Reset Controll Unit. 
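The block removed here (and its IPSBAR-relative twins in the other ColdFire headers below) described the Reset Controller registers; platform code triggered a software reset by setting SWRESET in RCR. A minimal sketch of that use, assuming the register names from the removed defines and the usual kernel I/O accessors (this is essentially the m520x_cpu_reset() that a later hunk in this same patch removes):

#include <linux/irqflags.h>
#include <asm/io.h>

#define MCF_RCR         0xFC0A0000      /* Reset Control Register (m520x) */
#define MCF_RCR_SWRESET 0x80            /* software reset bit */

static void m520x_soft_reset(void)
{
        local_irq_disable();
        __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); /* CPU resets on this write */
        for (;;)
                ;                               /* never reached */
}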
- */ -#define MCF_RCR 0xFC0A0000 -#define MCF_RSR 0xFC0A0001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m520xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m523xsim.h b/trunk/arch/m68k/include/asm/m523xsim.h index 55183b5df1b8..bf397313e93f 100644 --- a/trunk/arch/m68k/include/asm/m523xsim.h +++ b/trunk/arch/m68k/include/asm/m523xsim.h @@ -41,14 +41,5 @@ #define MCFSIM_DACR1 0x50 /* SDRAM base address 1 */ #define MCFSIM_DMR1 0x54 /* SDRAM address mask 1 */ -/* - * Reset Controll Unit (relative to IPSBAR). - */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m523xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m527xsim.h b/trunk/arch/m68k/include/asm/m527xsim.h index 95f4f8ee8f7c..1f63ab3fb3e6 100644 --- a/trunk/arch/m68k/include/asm/m527xsim.h +++ b/trunk/arch/m68k/include/asm/m527xsim.h @@ -70,14 +70,5 @@ #define UART2_ENABLE_MASK 0x3f00 #endif -/* - * Reset Controll Unit (relative to IPSBAR). - */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /****************************************************************************/ #endif /* m527xsim_h */ diff --git a/trunk/arch/m68k/include/asm/m528xsim.h b/trunk/arch/m68k/include/asm/m528xsim.h index d79c49f8134a..28bf783a5d6d 100644 --- a/trunk/arch/m68k/include/asm/m528xsim.h +++ b/trunk/arch/m68k/include/asm/m528xsim.h @@ -56,14 +56,6 @@ #define MCF5282_INTC0_ICR17 (volatile u8 *) (MCF_IPSBAR + 0x0C51) -/* - * Reset Control Unit (relative to IPSBAR). 
- */ -#define MCF_RCR 0x110000 -#define MCF_RSR 0x110001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ /********************************************************************* * diff --git a/trunk/arch/m68k/include/asm/m532xsim.h b/trunk/arch/m68k/include/asm/m532xsim.h index eb7fd4448947..ce603451b55e 100644 --- a/trunk/arch/m68k/include/asm/m532xsim.h +++ b/trunk/arch/m68k/include/asm/m532xsim.h @@ -125,18 +125,6 @@ #define ACR_CM_OFF_IMP (3<<5) #define ACR_WPROTECT (1<<2) -/********************************************************************* - * - * Reset Controller Module - * - *********************************************************************/ - -#define MCF_RCR 0xFC0A0000 -#define MCF_RSR 0xFC0A0001 - -#define MCF_RCR_SWRESET 0x80 /* Software reset bit */ -#define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ - /********************************************************************* * * Inter-IC (I2C) Module diff --git a/trunk/arch/m68k/include/asm/processor_no.h b/trunk/arch/m68k/include/asm/processor_no.h index 7a1e0ba35f5a..91cba18acdd3 100644 --- a/trunk/arch/m68k/include/asm/processor_no.h +++ b/trunk/arch/m68k/include/asm/processor_no.h @@ -72,10 +72,10 @@ struct thread_struct { unsigned char fpstate[FPSTATESIZE]; /* floating point state */ }; -#define INIT_THREAD { \ - .ksp = sizeof(init_stack) + (unsigned long) init_stack, \ - .sr = PS_S, \ - .fs = __KERNEL_DS, \ +#define INIT_THREAD { \ + sizeof(init_stack) + (unsigned long) init_stack, 0, \ + PS_S, __KERNEL_DS, \ + {0, 0}, 0, {0,}, {0, 0, 0}, {0,}, \ } /* diff --git a/trunk/arch/m68k/include/asm/swab.h b/trunk/arch/m68k/include/asm/swab.h index 5b754aace744..9e3054ea59e9 100644 --- a/trunk/arch/m68k/include/asm/swab.h +++ b/trunk/arch/m68k/include/asm/swab.h @@ -1,7 +1,7 @@ #ifndef _M68K_SWAB_H #define _M68K_SWAB_H -#include +#include #include #define __SWAB_64_THRU_32__ diff --git a/trunk/arch/m68k/include/asm/system_no.h b/trunk/arch/m68k/include/asm/system_no.h index 3c0718d74398..4496c0aa8379 100644 --- a/trunk/arch/m68k/include/asm/system_no.h +++ b/trunk/arch/m68k/include/asm/system_no.h @@ -203,6 +203,113 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz #include #endif +#if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \ + defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 ) +#define HARD_RESET_NOW() ({ \ + local_irq_disable(); \ + asm(" \ + moveal #0x10c00000, %a0; \ + moveb #0, 0xFFFFF300; \ + moveal 0(%a0), %sp; \ + moveal 4(%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#endif + +#ifdef CONFIG_COLDFIRE +#if defined(CONFIG_M5272) && defined(CONFIG_NETtel) +/* + * Need to account for broken early mask of 5272 silicon. So don't + * jump through the original start address. Jump strait into the + * known start of the FLASH code. + */ +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + jmp 0xf0000400; \ + "); \ +}) +#elif defined(CONFIG_NETtel) || \ + defined(CONFIG_SECUREEDGEMP3) || defined(CONFIG_CLEOPATRA) +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + moveal #0x10000044, %a0; \ + movel #0xffffffff, (%a0); \ + moveal #0x10000001, %a0; \ + moveb #0x00, (%a0); \ + moveal #0xf0000004, %a0; \ + moveal (%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#elif defined(CONFIG_M5272) +/* + * Retrieve the boot address in flash using CSBR0 and CSOR0 + * find the reset vector at flash_address + 4 (e.g. 0x400) + * remap it in the flash's current location (e.g. 0xf0000400) + * and jump there. 
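A C rendering of the assembly in this macro may help; this is illustrative only (the real sequence must execute with interrupts masked and without relying on RAM, hence the inline asm), with MCF_MBAR assumed from the m5272 platform headers:

static void m5272_flash_reset(void)
{
        unsigned long base, target;

        /* CSBR0 and CSOR0 (MBAR + 0x40/0x44) yield the flash base */
        base  = *(volatile unsigned long *)(MCF_MBAR + 0x40);
        base &= *(volatile unsigned long *)(MCF_MBAR + 0x44);
        base &= 0xfffff000;

        /* the reset vector sits at flash base + 4; OR-ing remaps it
         * into the flash's currently decoded window */
        target = base | *(volatile unsigned long *)(base + 4);

        ((void (*)(void))target)();     /* jump into the FLASH code */
}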
+ */ +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %%sr; \ + move.l %0+0x40,%%d0; \ + and.l %0+0x44,%%d0; \ + andi.l #0xfffff000,%%d0; \ + mov.l %%d0,%%a0; \ + or.l 4(%%a0),%%d0; \ + mov.l %%d0,%%a0; \ + jmp (%%a0);" \ + : /* No output */ \ + : "o" (*(char *)MCF_MBAR) ); \ +}) +#elif defined(CONFIG_M528x) +/* + * The MCF528x has a bit (SOFTRST) in memory (Reset Control Register RCR), + * that when set, resets the MCF528x. + */ +#define HARD_RESET_NOW() \ +({ \ + unsigned char volatile *reset; \ + asm("move.w #0x2700, %sr"); \ + reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000)); \ + while(1) \ + *reset |= (0x01 << 7);\ +}) +#elif defined(CONFIG_M523x) +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + movel #0x01000000, %sp; \ + moveal #0x40110000, %a0; \ + moveb #0x80, (%a0); \ + "); \ +}) +#elif defined(CONFIG_M520x) + /* + * The MCF5208 has a bit (SOFTRST) in memory (Reset Control Register + * RCR), that when set, resets the MCF5208. + */ +#define HARD_RESET_NOW() \ +({ \ + unsigned char volatile *reset; \ + asm("move.w #0x2700, %sr"); \ + reset = ((volatile unsigned char *)(MCF_IPSBAR + 0xA0000)); \ + while(1) \ + *reset |= 0x80; \ +}) +#else +#define HARD_RESET_NOW() ({ \ + asm(" \ + movew #0x2700, %sr; \ + moveal #0x4, %a0; \ + moveal (%a0), %a0; \ + jmp (%a0); \ + "); \ +}) +#endif +#endif #define arch_align_stack(x) (x) diff --git a/trunk/arch/m68knommu/kernel/entry.S b/trunk/arch/m68knommu/kernel/entry.S index f56faa5c9cd9..f4782d2dce8f 100644 --- a/trunk/arch/m68knommu/kernel/entry.S +++ b/trunk/arch/m68knommu/kernel/entry.S @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/trunk/arch/m68knommu/kernel/setup.c b/trunk/arch/m68knommu/kernel/setup.c index 5c2bb3eeaaa2..5985f1989021 100644 --- a/trunk/arch/m68knommu/kernel/setup.c +++ b/trunk/arch/m68knommu/kernel/setup.c @@ -166,13 +166,15 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. 
(Jate Sujjavanich)\n"); #endif - pr_debug("KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x " - "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext, - (int) &_sdata, (int) &_edata, - (int) &_sbss, (int) &_ebss); - pr_debug("MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ", - (int) &_ebss, (int) memory_start, - (int) memory_start, (int) memory_end); +#ifdef DEBUG + printk(KERN_DEBUG "KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x " + "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext, + (int) &_sdata, (int) &_edata, + (int) &_sbss, (int) &_ebss); + printk(KERN_DEBUG "MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ", + (int) &_ebss, (int) memory_start, + (int) memory_start, (int) memory_end); +#endif /* Keep a copy of command line */ *cmdline_p = &command_line[0]; diff --git a/trunk/arch/m68knommu/mm/init.c b/trunk/arch/m68knommu/mm/init.c index b1703c67a4f1..7befc0c357e0 100644 --- a/trunk/arch/m68knommu/mm/init.c +++ b/trunk/arch/m68knommu/mm/init.c @@ -126,7 +126,9 @@ void __init mem_init(void) unsigned long start_mem = memory_start; /* DAVIDM - these must start at end of kernel */ unsigned long end_mem = memory_end; /* DAVIDM - this must not include kernel stack at top */ - pr_debug("Mem_init: start=%lx, end=%lx\n", start_mem, end_mem); +#ifdef DEBUG + printk(KERN_DEBUG "Mem_init: start=%lx, end=%lx\n", start_mem, end_mem); +#endif end_mem &= PAGE_MASK; high_memory = (void *) end_mem; diff --git a/trunk/arch/m68knommu/platform/5206/config.c b/trunk/arch/m68knommu/platform/5206/config.c index f6f79874e9af..53a5920c2b71 100644 --- a/trunk/arch/m68knommu/platform/5206/config.c +++ b/trunk/arch/m68knommu/platform/5206/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5206_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -104,21 +109,10 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5206_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = m5206_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/5206e/config.c b/trunk/arch/m68knommu/platform/5206e/config.c index 65887799db81..db902540bf2c 100644 --- a/trunk/arch/m68knommu/platform/5206e/config.c +++ b/trunk/arch/m68knommu/platform/5206e/config.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5206e_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -104,17 +109,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5206e_cpu_reset(void) -{ - 
local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -125,7 +119,7 @@ void __init config_BSP(char *commandp, int size) commandp[size-1] = 0; #endif /* CONFIG_NETtel */ - mach_reset = m5206e_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/520x/config.c b/trunk/arch/m68knommu/platform/520x/config.c index 1c43a8aec69b..855fc6a79d72 100644 --- a/trunk/arch/m68knommu/platform/520x/config.c +++ b/trunk/arch/m68knommu/platform/520x/config.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m520x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -164,17 +169,9 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m520x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { - mach_reset = m520x_cpu_reset; + mach_reset = coldfire_reset; m520x_uarts_init(); m520x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/523x/config.c b/trunk/arch/m68knommu/platform/523x/config.c index 961fefebca14..74133f27b30c 100644 --- a/trunk/arch/m68knommu/platform/523x/config.c +++ b/trunk/arch/m68knommu/platform/523x/config.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m523x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -140,20 +145,13 @@ void mcf_autovector(unsigned int vec) { /* Everything is auto-vectored on the 523x */ } -/***************************************************************************/ - -static void m523x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} /***************************************************************************/ void __init config_BSP(char *commandp, int size) { mcf_disableall(); - mach_reset = m523x_cpu_reset; + mach_reset = coldfire_reset; m523x_uarts_init(); m523x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/5249/config.c b/trunk/arch/m68knommu/platform/5249/config.c index 93d998825925..9eab19d01eb1 100644 --- a/trunk/arch/m68knommu/platform/5249/config.c +++ b/trunk/arch/m68knommu/platform/5249/config.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m5249_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -101,21 +106,10 @@ void 
mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5249_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = m5249_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/5272/config.c b/trunk/arch/m68knommu/platform/5272/config.c index 5f95fcde05fd..e049245f4092 100644 --- a/trunk/arch/m68knommu/platform/5272/config.c +++ b/trunk/arch/m68knommu/platform/5272/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -167,19 +170,6 @@ void mcf_settimericr(int timer, int level) /***************************************************************************/ -static void m5272_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to reset, and enabled */ - __raw_writew(0, MCF_MBAR + MCFSIM_WIRR); - __raw_writew(1, MCF_MBAR + MCFSIM_WRRR); - __raw_writew(0, MCF_MBAR + MCFSIM_WCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { #if defined (CONFIG_MOD5272) @@ -204,7 +194,7 @@ void __init config_BSP(char *commandp, int size) mcf_timervector = 69; mcf_profilevector = 70; - mach_reset = m5272_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/527x/config.c b/trunk/arch/m68knommu/platform/527x/config.c index f746439cfd3e..428b15922ef5 100644 --- a/trunk/arch/m68knommu/platform/527x/config.c +++ b/trunk/arch/m68knommu/platform/527x/config.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,10 @@ /***************************************************************************/ +void coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m527x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -222,18 +227,10 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m527x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_disableall(); - mach_reset = m527x_cpu_reset; + mach_reset = coldfire_reset; m527x_uarts_init(); m527x_fec_init(); } diff --git a/trunk/arch/m68knommu/platform/528x/config.c b/trunk/arch/m68knommu/platform/528x/config.c index a1d1a61c4fe6..bee526f4d1af 100644 --- a/trunk/arch/m68knommu/platform/528x/config.c +++ b/trunk/arch/m68knommu/platform/528x/config.c @@ -31,6 +31,10 @@ /***************************************************************************/ +void 
coldfire_reset(void); + +/***************************************************************************/ + static struct mcf_platform_uart m528x_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE1, @@ -167,14 +171,6 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -static void m528x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR); -} - -/***************************************************************************/ - #ifdef CONFIG_WILDFIRE void wildfire_halt(void) { @@ -218,7 +214,6 @@ void __init config_BSP(char *commandp, int size) static int __init init_BSP(void) { - mach_reset = m528x_cpu_reset; m528x_uarts_init(); m528x_fec_init(); platform_add_devices(m528x_devices, ARRAY_SIZE(m528x_devices)); diff --git a/trunk/arch/m68knommu/platform/5307/config.c b/trunk/arch/m68knommu/platform/5307/config.c index 39da9e9ff674..44803bf70a6e 100644 --- a/trunk/arch/m68knommu/platform/5307/config.c +++ b/trunk/arch/m68knommu/platform/5307/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -116,17 +119,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5307_cpu_reset(void) -{ - local_irq_disable(); - /* Set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -142,7 +134,7 @@ void __init config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_reset = m5307_cpu_reset; + mach_reset = coldfire_reset; #ifdef CONFIG_BDM_DISABLE /* diff --git a/trunk/arch/m68knommu/platform/532x/config.c b/trunk/arch/m68knommu/platform/532x/config.c index cdb761971f7a..591f2f801134 100644 --- a/trunk/arch/m68knommu/platform/532x/config.c +++ b/trunk/arch/m68knommu/platform/532x/config.c @@ -31,6 +31,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -162,14 +164,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -static void m532x_cpu_reset(void) -{ - local_irq_disable(); - __raw_writeb(MCF_RCR_SWRESET, MCF_RCR); -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -187,7 +181,7 @@ void __init config_BSP(char *commandp, int size) mcf_timervector = 64+32; mcf_profilevector = 64+33; - mach_reset = m532x_cpu_reset; + mach_reset = coldfire_reset; #ifdef CONFIG_BDM_DISABLE /* diff --git a/trunk/arch/m68knommu/platform/5407/config.c b/trunk/arch/m68knommu/platform/5407/config.c index b41d942bf8d0..0ee8c1a200c8 100644 --- a/trunk/arch/m68knommu/platform/5407/config.c +++ b/trunk/arch/m68knommu/platform/5407/config.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include 
#include @@ -20,6 +21,8 @@ /***************************************************************************/ +void coldfire_reset(void); + extern unsigned int mcf_timervector; extern unsigned int mcf_profilevector; extern unsigned int mcf_timerlevel; @@ -107,17 +110,6 @@ void mcf_settimericr(unsigned int timer, unsigned int level) /***************************************************************************/ -void m5407_cpu_reset(void) -{ - local_irq_disable(); - /* set watchdog to soft reset, and enabled */ - __raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR); - for (;;) - /* wait for watchdog to timeout */; -} - -/***************************************************************************/ - void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -129,7 +121,7 @@ void __init config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_reset = m5407_cpu_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/trunk/arch/m68knommu/platform/coldfire/vectors.c b/trunk/arch/m68knommu/platform/coldfire/vectors.c index bdca0297fa9a..6cf894620234 100644 --- a/trunk/arch/m68knommu/platform/coldfire/vectors.c +++ b/trunk/arch/m68knommu/platform/coldfire/vectors.c @@ -96,3 +96,10 @@ void ack_vector(unsigned int irq) } /***************************************************************************/ + +void coldfire_reset(void) +{ + HARD_RESET_NOW(); +} + +/***************************************************************************/ diff --git a/trunk/arch/powerpc/include/asm/hw_irq.h b/trunk/arch/powerpc/include/asm/hw_irq.h index 53512374e1c9..20a44d0c9fdd 100644 --- a/trunk/arch/powerpc/include/asm/hw_irq.h +++ b/trunk/arch/powerpc/include/asm/hw_irq.h @@ -156,6 +156,8 @@ static inline void clear_perf_counter_pending(void) "i" (offsetof(struct paca_struct, perf_counter_pending))); } +extern void perf_counter_do_pending(void); + #else static inline unsigned long test_perf_counter_pending(void) @@ -165,6 +167,7 @@ static inline unsigned long test_perf_counter_pending(void) static inline void set_perf_counter_pending(void) {} static inline void clear_perf_counter_pending(void) {} +static inline void perf_counter_do_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ #endif /* __KERNEL__ */ diff --git a/trunk/arch/powerpc/kernel/irq.c b/trunk/arch/powerpc/kernel/irq.c index 844d3f882a15..feff792ed0f9 100644 --- a/trunk/arch/powerpc/kernel/irq.c +++ b/trunk/arch/powerpc/kernel/irq.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include diff --git a/trunk/arch/x86/kernel/apic/io_apic.c b/trunk/arch/x86/kernel/apic/io_apic.c index ef8d9290c7ea..94605e7f6a54 100644 --- a/trunk/arch/x86/kernel/apic/io_apic.c +++ b/trunk/arch/x86/kernel/apic/io_apic.c @@ -187,8 +187,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index d1c636bf31a7..be5ae80f897f 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -301,15 +301,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define 
MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -365,14 +363,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -402,14 +399,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else diff --git a/trunk/block/blk-core.c b/trunk/block/blk-core.c index f6452f692501..d17d71c71d4f 100644 --- a/trunk/block/blk-core.c +++ b/trunk/block/blk-core.c @@ -884,10 +884,9 @@ EXPORT_SYMBOL(blk_get_request); /** * blk_make_request - given a bio, allocate a corresponding struct request. - * @q: target request queue + * * @bio: The bio describing the memory mappings that will be submitted for IO. * It may be a chained-bio properly constructed by block/bio layer. - * @gfp_mask: gfp flags to be used for memory allocation * * blk_make_request is the parallel of generic_make_request for BLOCK_PC * type commands. Where the struct request needs to be farther initialized by @@ -1873,14 +1872,14 @@ EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers - * @req: the request being processed + * @rq: the request being processed * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @req + * @nr_bytes: number of bytes to complete @rq * * Description: - * Ends I/O on a number of bytes attached to @req, but doesn't complete - * the request structure even if @req doesn't have leftover. - * If @req has leftover, sets it up for the next range of segments. + * Ends I/O on a number of bytes attached to @rq, but doesn't complete + * the request structure even if @rq doesn't have leftover. + * If @rq has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. @@ -2146,7 +2145,7 @@ EXPORT_SYMBOL_GPL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Completely finish @rq. @@ -2167,7 +2166,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_all); /** * blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. 
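For context on the completion helpers whose kernel-doc these hunks touch, a hedged sketch of how a hypothetical driver would use them; the helper and its return convention are real for this kernel, the driver function is invented:

#include <linux/blkdev.h>

/* Complete @bytes of @rq; blk_end_request() returns true while the
 * request still has buffers pending and has been re-set up for the
 * next range of segments. */
static void mydrv_complete_bytes(struct request *rq, int error,
                                 unsigned int bytes)
{
        if (blk_end_request(rq, error, bytes))
                return;         /* partial completion, rq lives on */
        /* rq has been fully finished and released here */
}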
@@ -2204,7 +2203,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Completely finish @rq. Must be called with queue lock held. @@ -2225,7 +2224,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @err: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. Must diff --git a/trunk/block/blk-settings.c b/trunk/block/blk-settings.c index d71cedc09c4e..1c4df9bf6813 100644 --- a/trunk/block/blk-settings.c +++ b/trunk/block/blk-settings.c @@ -343,7 +343,7 @@ EXPORT_SYMBOL(blk_queue_physical_block_size); /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device - * @offset: alignment offset in bytes + * @alignment: alignment offset in bytes * * Description: * Some devices are naturally misaligned to compensate for things like @@ -362,7 +362,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); /** * blk_queue_io_min - set minimum request size for the queue * @q: the request queue for the device - * @min: smallest I/O size in bytes + * @io_min: smallest I/O size in bytes * * Description: * Some devices have an internal block size bigger than the reported @@ -385,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_io_min); /** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device - * @opt: optimal request size in bytes + * @io_opt: optimal request size in bytes * * Description: * Drivers can call this function to set the preferred I/O request diff --git a/trunk/drivers/ieee1394/dv1394.c b/trunk/drivers/ieee1394/dv1394.c index 2cd00b5b45b4..823a6297a1af 100644 --- a/trunk/drivers/ieee1394/dv1394.c +++ b/trunk/drivers/ieee1394/dv1394.c @@ -1789,13 +1789,12 @@ static int dv1394_open(struct inode *inode, struct file *file) } else { /* look up the card by ID */ unsigned long flags; - int idx = ieee1394_file_to_instance(file); spin_lock_irqsave(&dv1394_cards_lock, flags); if (!list_empty(&dv1394_cards)) { struct video_card *p; list_for_each_entry(p, &dv1394_cards, list) { - if ((p->id) == idx) { + if ((p->id) == ieee1394_file_to_instance(file)) { video = p; break; } @@ -1804,7 +1803,7 @@ static int dv1394_open(struct inode *inode, struct file *file) spin_unlock_irqrestore(&dv1394_cards_lock, flags); if (!video) { - debug_printk("dv1394: OHCI card %d not found", idx); + debug_printk("dv1394: OHCI card %d not found", ieee1394_file_to_instance(file)); return -ENODEV; } diff --git a/trunk/drivers/ieee1394/ieee1394_core.h b/trunk/drivers/ieee1394/ieee1394_core.h index 28b9f58bafd2..21d50f73a210 100644 --- a/trunk/drivers/ieee1394/ieee1394_core.h +++ b/trunk/drivers/ieee1394/ieee1394_core.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include "hosts.h" @@ -156,10 +155,7 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size, */ static inline unsigned char ieee1394_file_to_instance(struct file *file) { - int idx = cdev_index(file->f_path.dentry->d_inode); - if (idx < 0) - idx = 0; - return idx; + return file->f_path.dentry->d_inode->i_cindex; } extern int hpsb_disable_irm; diff --git a/trunk/drivers/usb/core/inode.c b/trunk/drivers/usb/core/inode.c index 
ffe75e83787c..dff5760a37f6 100644 --- a/trunk/drivers/usb/core/inode.c +++ b/trunk/drivers/usb/core/inode.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include "usb.h" #include "hcd.h" @@ -266,13 +265,9 @@ static int remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - lock_kernel(); - if (usbfs_mount && usbfs_mount->mnt_sb) update_sb(usbfs_mount->mnt_sb); - unlock_kernel(); - return 0; } diff --git a/trunk/fs/adfs/adfs.h b/trunk/fs/adfs/adfs.h index a6665f37f456..e0a85dbeeb88 100644 --- a/trunk/fs/adfs/adfs.h +++ b/trunk/fs/adfs/adfs.h @@ -53,7 +53,6 @@ struct adfs_dir_ops { int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); - int (*sync)(struct adfs_dir *dir); void (*free)(struct adfs_dir *dir); }; @@ -91,8 +90,7 @@ extern const struct dentry_operations adfs_dentry_operations; extern struct adfs_dir_ops adfs_f_dir_ops; extern struct adfs_dir_ops adfs_fplus_dir_ops; -extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, - int wait); +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); /* file.c */ extern const struct inode_operations adfs_file_inode_operations; diff --git a/trunk/fs/adfs/dir.c b/trunk/fs/adfs/dir.c index 4d4073447d1a..e867ccf37246 100644 --- a/trunk/fs/adfs/dir.c +++ b/trunk/fs/adfs/dir.c @@ -83,7 +83,7 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } int -adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) +adfs_dir_update(struct super_block *sb, struct object_info *obj) { int ret = -EINVAL; #ifdef CONFIG_ADFS_FS_RW @@ -106,12 +106,6 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) ret = ops->update(&dir, obj); write_unlock(&adfs_dir_lock); - if (wait) { - int err = ops->sync(&dir); - if (!ret) - ret = err; - } - ops->free(&dir); out: #endif @@ -205,7 +199,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, }; static int diff --git a/trunk/fs/adfs/dir_f.c b/trunk/fs/adfs/dir_f.c index 31df6adf0de6..ea7df2146921 100644 --- a/trunk/fs/adfs/dir_f.c +++ b/trunk/fs/adfs/dir_f.c @@ -437,22 +437,6 @@ adfs_f_update(struct adfs_dir *dir, struct object_info *obj) #endif } -static int -adfs_f_sync(struct adfs_dir *dir) -{ - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } - - return err; -} - static void adfs_f_free(struct adfs_dir *dir) { @@ -472,6 +456,5 @@ struct adfs_dir_ops adfs_f_dir_ops = { .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, - .sync = adfs_f_sync, .free = adfs_f_free }; diff --git a/trunk/fs/adfs/dir_fplus.c b/trunk/fs/adfs/dir_fplus.c index 139e0f345f18..1ec644e32df9 100644 --- a/trunk/fs/adfs/dir_fplus.c +++ b/trunk/fs/adfs/dir_fplus.c @@ -161,22 +161,6 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) return ret; } -static int -adfs_fplus_sync(struct adfs_dir *dir) -{ - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } - - return err; -} - static void adfs_fplus_free(struct adfs_dir *dir) { 
@@ -191,6 +175,5 @@ struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, - .sync = adfs_fplus_sync, .free = adfs_fplus_free }; diff --git a/trunk/fs/adfs/file.c b/trunk/fs/adfs/file.c index 8224d54a2afb..36e381c6a99a 100644 --- a/trunk/fs/adfs/file.c +++ b/trunk/fs/adfs/file.c @@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/trunk/fs/adfs/inode.c b/trunk/fs/adfs/inode.c index 05b3a677201d..e647200262a2 100644 --- a/trunk/fs/adfs/inode.c +++ b/trunk/fs/adfs/inode.c @@ -376,7 +376,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, int unused) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj); unlock_kernel(); return ret; } diff --git a/trunk/fs/adfs/map.c b/trunk/fs/adfs/map.c index 568081b93f73..92ab4fbc2031 100644 --- a/trunk/fs/adfs/map.c +++ b/trunk/fs/adfs/map.c @@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock); #define GET_FRAG_ID(_map,_start,_idmask) \ ({ \ unsigned char *_m = _map + (_start >> 3); \ - u32 _frag = get_unaligned_le32(_m); \ + u32 _frag = get_unaligned((u32 *)_m); \ _frag >>= (_start & 7); \ _frag & _idmask; \ }) diff --git a/trunk/fs/adfs/super.c b/trunk/fs/adfs/super.c index 0ec5aaf47aa7..dd9becca4241 100644 --- a/trunk/fs/adfs/super.c +++ b/trunk/fs/adfs/super.c @@ -132,15 +132,11 @@ static void adfs_put_super(struct super_block *sb) int i; struct adfs_sb_info *asb = ADFS_SB(sb); - lock_kernel(); - for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); kfree(asb); sb->s_fs_info = NULL; - - unlock_kernel(); } static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) diff --git a/trunk/fs/affs/affs.h b/trunk/fs/affs/affs.h index e511dc621a2e..1a2d5e3c7f4e 100644 --- a/trunk/fs/affs/affs.h +++ b/trunk/fs/affs/affs.h @@ -182,7 +182,6 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, struct dentry *, int); /* dir.c */ diff --git a/trunk/fs/affs/dir.c b/trunk/fs/affs/dir.c index 8ca8f3a55599..7b36904dbeac 100644 --- a/trunk/fs/affs/dir.c +++ b/trunk/fs/affs/dir.c @@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = affs_readdir, - .fsync = affs_file_fsync, + .fsync = file_fsync, }; /* diff --git a/trunk/fs/affs/file.c b/trunk/fs/affs/file.c index 184e55c1c9ba..9246cb4aa018 100644 --- a/trunk/fs/affs/file.c +++ b/trunk/fs/affs/file.c @@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = { .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, - .fsync = affs_file_fsync, + .fsync = file_fsync, .splice_read = generic_file_splice_read, }; @@ -915,15 +915,3 @@ affs_truncate(struct inode *inode) } 
affs_free_prealloc(inode); } - -int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) -{ - struct inode * inode = dentry->d_inode; - int ret, err; - - ret = write_inode_now(inode, 0); - err = sync_blockdev(inode->i_sb->s_bdev); - if (!ret) - ret = err; - return ret; -} diff --git a/trunk/fs/affs/super.c b/trunk/fs/affs/super.c index 104fdcb3a7fc..63f5183f263b 100644 --- a/trunk/fs/affs/super.c +++ b/trunk/fs/affs/super.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "affs.h" extern struct timezone sys_tz; @@ -24,68 +23,50 @@ extern struct timezone sys_tz; static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); -static void -affs_commit_super(struct super_block *sb, int clean) -{ - struct affs_sb_info *sbi = AFFS_SB(sb); - struct buffer_head *bh = sbi->s_root_bh; - struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); - - tail->bm_flag = cpu_to_be32(clean); - secs_to_datestamp(get_seconds(), &tail->disk_change); - affs_fix_checksum(sb, bh); - mark_buffer_dirty(bh); -} - static void affs_put_super(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); pr_debug("AFFS: put_super()\n"); - lock_kernel(); - - if (!(sb->s_flags & MS_RDONLY)) - affs_commit_super(sb, 1); + if (!(sb->s_flags & MS_RDONLY)) { + AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); + secs_to_datestamp(get_seconds(), + &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); + affs_fix_checksum(sb, sbi->s_root_bh); + mark_buffer_dirty(sbi->s_root_bh); + } kfree(sbi->s_prefix); affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); kfree(sbi); sb->s_fs_info = NULL; - - unlock_kernel(); + return; } static void affs_write_super(struct super_block *sb) { int clean = 2; + struct affs_sb_info *sbi = AFFS_SB(sb); - lock_super(sb); if (!(sb->s_flags & MS_RDONLY)) { // if (sbi->s_bitmap[i].bm_bh) { // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { // clean = 0; - affs_commit_super(sb, clean); + AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); + secs_to_datestamp(get_seconds(), + &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); + affs_fix_checksum(sb, sbi->s_root_bh); + mark_buffer_dirty(sbi->s_root_bh); sb->s_dirt = !clean; /* redo until bitmap synced */ } else sb->s_dirt = 0; - unlock_super(sb); pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); } -static int -affs_sync_fs(struct super_block *sb, int wait) -{ - lock_super(sb); - affs_commit_super(sb, 2); - sb->s_dirt = 0; - unlock_super(sb); - return 0; -} - static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) @@ -143,7 +124,6 @@ static const struct super_operations affs_sops = { .clear_inode = affs_clear_inode, .put_super = affs_put_super, .write_super = affs_write_super, - .sync_fs = affs_sync_fs, .statfs = affs_statfs, .remount_fs = affs_remount, .show_options = generic_show_options, @@ -527,7 +507,6 @@ affs_remount(struct super_block *sb, int *flags, char *data) kfree(new_opts); return -EINVAL; } - lock_kernel(); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; @@ -535,10 +514,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_uid = uid; sbi->s_gid = gid; - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) { sb->s_dirt = 1; while (sb->s_dirt) @@ -547,7 +524,6 @@ affs_remount(struct super_block 
*sb, int *flags, char *data) } else res = affs_init_bitmap(sb, flags); - unlock_kernel(); return res; } diff --git a/trunk/fs/afs/mntpt.c b/trunk/fs/afs/mntpt.c index c52be53f6946..2b9e2d03a390 100644 --- a/trunk/fs/afs/mntpt.c +++ b/trunk/fs/afs/mntpt.c @@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; default: diff --git a/trunk/fs/afs/super.c b/trunk/fs/afs/super.c index ad0514d0115f..76828e5f8a39 100644 --- a/trunk/fs/afs/super.c +++ b/trunk/fs/afs/super.c @@ -440,12 +440,8 @@ static void afs_put_super(struct super_block *sb) _enter(""); - lock_kernel(); - afs_put_volume(as->volume); - unlock_kernel(); - _leave(""); } diff --git a/trunk/fs/autofs/dirhash.c b/trunk/fs/autofs/dirhash.c index 2316e944a109..4eb4d8dfb2f1 100644 --- a/trunk/fs/autofs/dirhash.c +++ b/trunk/fs/autofs/dirhash.c @@ -85,12 +85,13 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path)) { + if (!follow_down(&path.mnt, &path.dentry)) { path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)); + while (d_mountpoint(path.dentry) && + follow_down(&path.mnt, &path.dentry)) ; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/trunk/fs/autofs4/autofs_i.h b/trunk/fs/autofs4/autofs_i.h index 8f7cdde41733..b7ff33c63101 100644 --- a/trunk/fs/autofs4/autofs_i.h +++ b/trunk/fs/autofs4/autofs_i.h @@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) +static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) { int res = 0; - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); + while (d_mountpoint(*dentry)) { + int followed = follow_down(mnt, dentry); if (!followed) break; res = 1; diff --git a/trunk/fs/autofs4/dev-ioctl.c b/trunk/fs/autofs4/dev-ioctl.c index f3da2eb51f56..84168c0dcc2d 100644 --- a/trunk/fs/autofs4/dev-ioctl.c +++ b/trunk/fs/autofs4/dev-ioctl.c @@ -192,42 +192,77 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } -static int find_autofs_mount(const char *pathname, - struct path *res, - int test(struct path *path, void *data), - void *data) +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested device number (aka. new_encode_dev(sb->s_dev). 
+ */ +static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) { - struct path path; - int err = kern_path(pathname, 0, &path); - if (err) - return err; + struct dentry *dentry; + struct inode *inode; + struct super_block *sb; + dev_t s_dev; + unsigned int err; + err = -ENOENT; - while (path.dentry == path.mnt->mnt_root) { - if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { - if (test(&path, data)) { - path_get(&path); - if (!err) /* already found some */ - path_put(res); - *res = path; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + inode = nd->path.dentry->d_inode; + if (!inode) + break; + + sb = inode->i_sb; + s_dev = new_encode_dev(sb->s_dev); + if (devno == s_dev) { + if (sb->s_magic == AUTOFS_SUPER_MAGIC) { err = 0; + break; } } - if (!follow_up(&path)) - break; } - path_put(&path); +out: return err; } -static int test_by_dev(struct path *path, void *p) +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested mount type (ie. indirect, direct or offset). + */ +static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) { - return path->mnt->mnt_sb->s_dev == *(dev_t *)p; -} + struct dentry *dentry; + struct autofs_info *ino; + unsigned int err; -static int test_by_type(struct path *path, void *p) -{ - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); - return ino && ino->sbi->type & *(unsigned *)p; + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + ino = autofs4_dentry_ino(nd->path.dentry); + if (ino && ino->sbi->type & type) { + err = 0; + break; + } + } +out: + return err; } static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) @@ -248,25 +283,31 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ -static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) { + struct file *filp; + struct nameidata nd; int err, fd; fd = get_unused_fd(); if (likely(fd >= 0)) { - struct file *filp; - struct path path; - - err = find_autofs_mount(name, &path, test_by_dev, &devid); + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; /* - * Find autofs super block that has the device number + * Search down, within the parent, looking for an + * autofs super block that has the device number * corresponding to the autofs fs we want to open. 
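The device-number comparison driving this search is worth spelling out: the walk matches the userspace-visible encoding of sb->s_dev. A small illustrative helper (hypothetical name) equivalent to the test inside autofs_dev_ioctl_find_super() above:

#include <linux/fs.h>
#include <linux/kdev_t.h>

/* Does @sb carry the userspace-visible device number @devno?
 * Userspace learns devno as new_encode_dev(sb->s_dev). */
static int autofs_sb_matches_devno(struct super_block *sb, dev_t devno)
{
        return new_encode_dev(sb->s_dev) == devno;
}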
*/ + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) { + path_put(&nd.path); + goto out; + } - filp = dentry_open(path.dentry, path.mnt, O_RDONLY, + filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, current_cred()); if (IS_ERR(filp)) { err = PTR_ERR(filp); @@ -299,7 +340,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp, param->ioctlfd = -1; path = param->path; - devid = new_decode_dev(param->openmount.devid); + devid = param->openmount.devid; err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -434,7 +475,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_dev_ioctl *param) { struct autofs_info *ino; - struct path path; + struct nameidata nd; + const char *path; dev_t devid; int err = -ENOENT; @@ -443,24 +485,32 @@ static int autofs_dev_ioctl_requester(struct file *fp, goto out; } - devid = sbi->sb->s_dev; + path = param->path; + devid = new_encode_dev(sbi->sb->s_dev); param->requester.uid = param->requester.gid = -1; - err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry); + autofs4_expire_wait(nd.path.dentry); spin_lock(&sbi->fs_lock); param->requester.uid = ino->uid; param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } - path_put(&path); + +out_release: + path_put(&nd.path); out: return err; } @@ -519,8 +569,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - struct path path; - const char *name; + struct nameidata nd; + const char *path; unsigned int type; unsigned int devid, magic; int err = -ENOENT; @@ -530,46 +580,71 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; } - name = param->path; + path = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (autofs_type_any(type)) - err = kern_path(name, LOOKUP_FOLLOW, &path); - else - err = find_autofs_mount(name, &path, test_by_type, &type); - if (err) - goto out; - devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + if (autofs_type_any(type)) { + struct super_block *sb; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + goto out; + + sb = nd.path.dentry->d_sb; + devid = new_encode_dev(sb->s_dev); + } else { + struct autofs_info *ino; + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_sbi_type(&nd, type); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + devid = autofs4_get_dev(ino->sbi); + } + err = 0; - if (path.dentry->d_inode && - path.mnt->mnt_root == path.dentry) { + if (nd.path.dentry->d_inode && + nd.path.mnt->mnt_root == nd.path.dentry) { err = 1; - magic = path.dentry->d_inode->i_sb->s_magic; + magic = nd.path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t dev = sbi->sb->s_dev; + dev_t dev = autofs4_get_dev(sbi); - err = find_autofs_mount(name, &path, test_by_dev, &dev); + err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - devid = new_encode_dev(dev); + err = autofs_dev_ioctl_find_super(&nd, dev); + if (err) + goto out_release; + + devid = dev; - err = have_submounts(path.dentry); + err = 
have_submounts(nd.path.dentry); - if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { - if (follow_down(&path)) - magic = path.mnt->mnt_sb->s_magic; + if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { + if (follow_down(&nd.path.mnt, &nd.path.dentry)) { + struct inode *inode = nd.path.dentry->d_inode; + magic = inode->i_sb->s_magic; + } } } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; - path_put(&path); + +out_release: + path_put(&nd.path); out: return err; } diff --git a/trunk/fs/autofs4/expire.c b/trunk/fs/autofs4/expire.c index aa39ae83f019..3077d8f16523 100644 --- a/trunk/fs/autofs4/expire.c +++ b/trunk/fs/autofs4/expire.c @@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry, static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; - struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - path_get(&path); + mntget(mnt); + dget(dentry); - if (!follow_down(&path)) + if (!follow_down(&mnt, &dentry)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs4_dentry(dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. */ - if (!d_mountpoint(path.dentry)) { + if (!d_mountpoint(dentry)) { status = 0; goto done; } @@ -86,7 +86,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: DPRINTK("returning = %d", status); - path_put(&path); + dput(dentry); + mntput(mnt); return status; } diff --git a/trunk/fs/autofs4/root.c b/trunk/fs/autofs4/root.c index b96a3c57359d..e383bf0334f1 100644 --- a/trunk/fs/autofs4/root.c +++ b/trunk/fs/autofs4/root.c @@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger + * to break out of follow_down() at the autofs mount trigger (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. */ - if (!follow_down(&nd->path)) + if (!follow_down(&nd->path.mnt, &nd->path.dentry)) goto done; goto follow; } @@ -230,7 +230,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) * to follow it.
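With struct path gone from autofs4_mount_busy() above, the reference pinning is spelled out by hand. path_get() and path_put() bundle exactly these operations in exactly this order, so the two spellings are equivalent; condensed for illustration (helper names as in the hunk):

	/* the equivalent of path_get(&path) on the pair (mnt, dentry) */
	mntget(mnt);
	dget(dentry);

	/* ... follow_down() may now replace both pointers, dropping the
	 * references it consumes and returning pinned replacements ... */

	/* the equivalent of path_put(&path): dentry first, then mount */
	dput(dentry);
	mntput(mnt);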
*/ if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { + if (!autofs4_follow_mount(&nd->path.mnt, + &nd->path.dentry)) { status = -ENOENT; goto out_error; } diff --git a/trunk/fs/befs/linuxvfs.c b/trunk/fs/befs/linuxvfs.c index 9367b6297d84..76afd0d6b86c 100644 --- a/trunk/fs/befs/linuxvfs.c +++ b/trunk/fs/befs/linuxvfs.c @@ -737,8 +737,6 @@ parse_options(char *options, befs_mount_options * opts) static void befs_put_super(struct super_block *sb) { - lock_kernel(); - kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; @@ -749,8 +747,7 @@ befs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); + return; } /* Allocate private field of the superblock, fill it. diff --git a/trunk/fs/bfs/dir.c b/trunk/fs/bfs/dir.c index 54bd07d44e68..4dd1b623f937 100644 --- a/trunk/fs/bfs/dir.c +++ b/trunk/fs/bfs/dir.c @@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, .llseek = generic_file_llseek, }; @@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) inode->i_nlink = 1; } de->ino = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; @@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME_SEC; inode_dec_link_count(new_inode); } - mark_buffer_dirty_inode(old_bh, old_dir); + mark_buffer_dirty(old_bh); error = 0; end_rename: @@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? name[i] : 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); brelse(bh); return 0; } diff --git a/trunk/fs/bfs/inode.c b/trunk/fs/bfs/inode.c index 6f60336c6628..cc4062d12ca2 100644 --- a/trunk/fs/bfs/inode.c +++ b/trunk/fs/bfs/inode.c @@ -30,7 +30,6 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) 
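The bfs hunks above undo two related changes at once: directory buffers go back to plain mark_buffer_dirty() instead of mark_buffer_dirty_inode(), which had threaded them onto the owning inode's private list where a targeted fsync (sync_mapping_buffers(), as used by simple_fsync) could find them, and .fsync accordingly falls back to the heavyweight file_fsync(), which flushes the whole device. For context, file_fsync() of this era looked roughly as follows (reproduced from memory and hedged accordingly):

	int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		struct super_block *sb = inode->i_sb;
		int ret, err;

		/* sync the inode to buffers */
		ret = write_inode_now(inode, 0);

		/* sync the superblock to buffers */
		lock_super(sb);
		if (sb->s_dirt && sb->s_op->write_super)
			sb->s_op->write_super(sb);
		unlock_super(sb);

		/* .. finally sync the buffers to disk */
		err = sync_blockdev(sb->s_bdev);
		if (!ret)
			ret = err;
		return ret;
	}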
#endif -static void bfs_write_super(struct super_block *s); void dump_imap(const char *prefix, struct super_block *s); struct inode *bfs_iget(struct super_block *sb, unsigned long ino) @@ -98,15 +97,14 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, int unused) { - struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int block, off; - int err = 0; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); dprintf("ino=%08x\n", ino); @@ -147,14 +145,9 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } brelse(bh); mutex_unlock(&info->bfs_lock); - return err; + return 0; } static void bfs_delete_inode(struct inode *inode) @@ -216,26 +209,6 @@ static void bfs_delete_inode(struct inode *inode) clear_inode(inode); } -static int bfs_sync_fs(struct super_block *sb, int wait) -{ - struct bfs_sb_info *info = BFS_SB(sb); - - mutex_lock(&info->bfs_lock); - mark_buffer_dirty(info->si_sbh); - sb->s_dirt = 0; - mutex_unlock(&info->bfs_lock); - - return 0; -} - -static void bfs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - bfs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); @@ -243,18 +216,11 @@ static void bfs_put_super(struct super_block *s) if (!info) return; - lock_kernel(); - - if (s->s_dirt) - bfs_write_super(s); - brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; - - unlock_kernel(); } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -274,6 +240,17 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void bfs_write_super(struct super_block *s) +{ + struct bfs_sb_info *info = BFS_SB(s); + + mutex_lock(&info->bfs_lock); + if (!(s->s_flags & MS_RDONLY)) + mark_buffer_dirty(info->si_sbh); + s->s_dirt = 0; + mutex_unlock(&info->bfs_lock); +} + static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) @@ -321,7 +298,6 @@ static const struct super_operations bfs_sops = { .delete_inode = bfs_delete_inode, .put_super = bfs_put_super, .write_super = bfs_write_super, - .sync_fs = bfs_sync_fs, .statfs = bfs_statfs, }; diff --git a/trunk/fs/block_dev.c b/trunk/fs/block_dev.c index 3a6d4fb2a329..931f6b8c4b2f 100644 --- a/trunk/fs/block_dev.c +++ b/trunk/fs/block_dev.c @@ -176,22 +176,17 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } -int __sync_blockdev(struct block_device *bdev, int wait) -{ - if (!bdev) - return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); -} - /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. 
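Note what the bfs_write_inode() hunk above discards: the reverted version renames the wait argument to unused and never waits, so a synchronous ->write_inode(inode, 1) call now returns after merely dirtying the buffer. The removed lines followed the standard buffer-head pattern for honoring wait, shown here for reference (sketch assembled from the dropped lines themselves):

	mark_buffer_dirty(bh);
	if (wait) {
		/* force this buffer out now and report any write error */
		sync_dirty_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			err = -EIO;
	}
	brelse(bh);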
*/ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; } EXPORT_SYMBOL(sync_blockdev); @@ -204,7 +199,7 @@ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { - int res = sync_filesystem(sb); + int res = fsync_super(sb); drop_super(sb); return res; } @@ -246,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - sync_filesystem(sb); + __fsync_super(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index a35eb36b32fd..94212844a9bc 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o relocation.o + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o diff --git a/trunk/fs/btrfs/acl.c b/trunk/fs/btrfs/acl.c index 603972576f0f..cbba000dccbe 100644 --- a/trunk/fs/btrfs/acl.c +++ b/trunk/fs/btrfs/acl.c @@ -351,4 +351,9 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + #endif /* CONFIG_FS_POSIX_ACL */ diff --git a/trunk/fs/btrfs/async-thread.c b/trunk/fs/btrfs/async-thread.c index 7f88628a1a72..502c3d61de62 100644 --- a/trunk/fs/btrfs/async-thread.c +++ b/trunk/fs/btrfs/async-thread.c @@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); - worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, workers->num_workers + i); + worker->workers = workers; if (IS_ERR(worker->task)) { kfree(worker); ret = PTR_ERR(worker->task); diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index acb4f3517582..b30986f00b9d 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -72,9 +72,6 @@ struct btrfs_inode { */ struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ - struct rb_node rb_node; - /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -157,4 +154,5 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } + #endif diff --git a/trunk/fs/btrfs/compression.c b/trunk/fs/btrfs/compression.c index de1e2fd32080..ab07627084f1 100644 --- a/trunk/fs/btrfs/compression.c +++ b/trunk/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (btrfs_test_flag(inode, NODATASUM)) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + if (!btrfs_test_flag(inode, NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ 
-697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + if (!btrfs_test_flag(inode, NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/trunk/fs/btrfs/crc32c.h b/trunk/fs/btrfs/crc32c.h new file mode 100644 index 000000000000..6e1b3de36700 --- /dev/null +++ b/trunk/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index 60a45f3a4e91..fedf8b9f03a2 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -197,7 +197,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, u32 nritems; int ret = 0; int level; - struct btrfs_disk_key disk_key; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); @@ -205,37 +212,28 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); - if (IS_ERR(cow)) + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); return PTR_ERR(cow); + } copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, new_root_objectid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret =
btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); if (ret) return ret; @@ -245,125 +243,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -/* - * check if the tree block can be shared by multiple trees - */ -int btrfs_block_can_be_shared(struct btrfs_root *root, - struct extent_buffer *buf) -{ - /* - * Tree blocks not in refernece counted trees and tree roots - * are never shared. If a block was allocated after the last - * snapshot and the block was not allocated by tree relocation, - * we know the block is not shared. - */ - if (root->ref_cows && - buf != root->node && buf != root->commit_root && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - return 1; -#endif - return 0; -} - -static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *cow) -{ - u64 refs; - u64 owner; - u64 flags; - u64 new_flags = 0; - int ret; - - /* - * Backrefs update rules: - * - * Always use full backrefs for extent pointers in tree block - * allocated by tree relocation. - * - * If a shared tree block is no longer referenced by its owner - * tree (btrfs_header_owner(buf) == root->root_key.objectid), - * use full backrefs for extent pointers in tree block. - * - * If a tree block is been relocating - * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), - * use full backrefs for extent pointers in tree block. - * The reason for this is some operations (such as drop tree) - * are only allowed for blocks use full backrefs. - */ - - if (btrfs_block_can_be_shared(root, buf)) { - ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); - } else { - refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; - else - flags = 0; - } - - owner = btrfs_header_owner(buf); - BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - - if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); - } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else { - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - } - if (new_flags != 0) { - ret = btrfs_set_disk_extent_flags(trans, root, - buf->start, - buf->len, - new_flags, 0); - BUG_ON(ret); - } - } else { - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); - } - clean_tree_block(trans, root, buf); - } - return 0; -} - /* * does the dirty work in cow of a single block. 
The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -383,39 +262,34 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size) { - struct btrfs_disk_key disk_key; + u64 parent_start; struct extent_buffer *cow; + u32 nritems; + int ret = 0; int level; int unlock_orig = 0; - u64 parent_start; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_locked(buf); + if (parent) + parent_start = parent->start; + else + parent_start = 0; + WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent) - parent_start = parent->start; - else - parent_start = 0; - } else - parent_start = 0; - - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, root->root_key.objectid, + trans->transid, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -424,53 +298,83 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. 
+ */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - parent_start = buf->start; - else - parent_start = 0; spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } free_extent_buffer(buf); add_root_to_dirty_list(root); } else { - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent_start = parent->start; - else - parent_start = 0; - - WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_set_node_blockptr(parent, parent_slot, cow->start); + WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -480,18 +384,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) -{ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 0; - return 1; -} - /* * cows a single block, see __btrfs_cow_block for the real work. 
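The reference bookkeeping restored to __btrfs_cow_block() above turns on a single question: was buf last written during the current transaction? A condensed view of the three branches, with names exactly as in the hunk and error handling plus the reloc-tree details elided:

	if (btrfs_header_generation(buf) != trans->transid) {
		/* older block: the original stays live in the previous
		 * tree, so everything it points to gains a reference
		 * on behalf of the new copy */
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/* reloc-tree block: move the last reference, or add a
		 * shared one, depending on the current ref count */
	} else {
		/* same-transaction block: references move wholesale to
		 * the copy and the original is scrubbed for reuse */
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		clean_tree_block(trans, root, buf);
	}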
* This version of it has extra checks so that a block isn't cow'd more than @@ -519,7 +411,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - if (!should_cow_block(trans, root, buf)) { + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; return 0; } @@ -575,7 +469,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -951,12 +845,6 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -1033,6 +921,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root->node = child; spin_unlock(&root->node_lock); + ret = btrfs_update_extent_ref(trans, root, child->start, + child->len, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -1043,7 +938,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level, 1); + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -1052,7 +949,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) + if (trans->transaction->delayed_refs.flushing && + btrfs_header_nritems(mid) > 2) return 0; if (btrfs_header_nritems(mid) < 2) @@ -1100,6 +998,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; if (btrfs_header_nritems(right) == 0) { u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); u32 blocksize = right->len; clean_tree_block(trans, root, right); @@ -1111,9 +1010,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, - blocksize, 0, - root->root_key.objectid, - level, 0); + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); if (wret) ret = wret; } else { @@ -1148,6 +1047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; @@ -1159,8 +1059,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, - level, 0); + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); if (wret) ret = wret; } else { @@ -1536,7 +1437,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if 
(path->keep_locks) + if (path->keep_locks || path->lowest_level) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1651,7 +1552,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { int sret; sret = reada_for_balance(root, p, level); @@ -1713,17 +1614,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_unlock = 2; again: - if (p->search_commit_root) { - b = root->commit_root; - extent_buffer_get(b); - if (!p->skip_locking) - btrfs_tree_lock(b); - } else { - if (p->skip_locking) - b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); - } + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); while (b) { level = btrfs_header_level(b); @@ -1744,9 +1638,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; - + } btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, @@ -1868,6 +1764,138 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); + BUG_ON(ret); + + btrfs_set_lock_blocking(eb); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owned by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them.
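The skip-COW test that btrfs_search_slot() open-codes above (and that btrfs_cow_block() open-codes earlier) is worth spelling out, since this patch also removes the should_cow_block() helper that used to give it a name. A hypothetical wrapper with the same three-part condition, written out purely for readability:

	/* a block may be modified in place only if all three hold: it was
	 * written in this transaction, it belongs to this root, and it
	 * has not been flushed to disk since it was dirtied */
	static int can_modify_in_place(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *b)
	{
		return btrfs_header_generation(b) == trans->transid &&
		       btrfs_header_owner(b) == root->root_key.objectid &&
		       !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN);
	}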
+ */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. 
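Every bulk-move helper patched below (push_node_left, balance_node_right, the leaf pushers, copy_for_split and split_node) regains a trailing btrfs_update_ref() call: under the reverted back-reference scheme each extent ref names the block that holds the pointer to it, so whenever child pointers are copied between nodes the moved entries must be re-tagged with their new parent. The call shape is uniform; from the push_node_left hunk (arguments as shown there):

	/* after moving push_items pointers out of src and into dst,
	 * starting at slot dst_nritems of dst, fix up their backrefs */
	ret = btrfs_update_ref(trans, root, src, dst,
			       dst_nritems, push_items);
	BUG_ON(ret);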
@@ -1993,6 +2021,9 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + return ret; } @@ -2052,6 +2083,9 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + return ret; } @@ -2071,6 +2105,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; + int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -2082,17 +2117,16 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, + root->root_key.objectid, trans->transid, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + memset_extent_buffer(c, 0, 0, root->nodesize); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); btrfs_set_header_generation(c, trans->transid); - btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); write_extent_buffer(c, root->fs_info->fsid, @@ -2117,6 +2151,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root->node = c; spin_unlock(&root->node_lock); + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->len, lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2193,7 +2233,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else { + } else if (!trans->transaction->delayed_refs.flushing) { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2204,21 +2244,20 @@ static noinline int split_node(struct btrfs_trans_handle *trans, } c_nritems = btrfs_header_nritems(c); - mid = (c_nritems + 1) / 2; - btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, root->root_key.objectid, - &disk_key, level, c->start, 0); + trans->transid, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_flags(split, btrfs_header_flags(c)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); - btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); @@ -2226,6 +2265,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); + mid = (c_nritems + 1) / 2; copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), @@ -2238,12 +2278,16 @@ static noinline int 
split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); + btrfs_node_key(split, &disk_key, 0); wret = insert_ptr(trans, root, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + if (path->slots[level] >= mid) { path->slots[level] -= mid; btrfs_tree_unlock(c); @@ -2316,6 +2360,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 right_nritems; u32 data_end; u32 this_item_size; + int ret; if (empty) nr = 0; @@ -2428,6 +2473,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); btrfs_mark_buffer_dirty(right); + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); @@ -2672,6 +2720,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) @@ -2828,6 +2880,9 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(l); BUG_ON(path->slots[0] != slot); + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + if (mid <= slot) { btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -2856,7 +2911,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, int extend) { - struct btrfs_disk_key disk_key; struct extent_buffer *l; u32 nritems; int mid; @@ -2864,11 +2918,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *right; int ret = 0; int wret; - int split; + int double_split; int num_doubles = 0; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && + !trans->transaction->delayed_refs.flushing) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2890,53 +2945,16 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return ret; } again: - split = 1; + double_split = 0; l = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(l); mid = (nritems + 1) / 2; - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - split = 0; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - split = 2; - } - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - split = 0; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - split = 2 ; - } - } - } - } - - if (split == 0) - btrfs_cpu_key_to_disk(&disk_key, ins_key); - else - btrfs_item_key(l, &disk_key, mid); - - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + right = btrfs_alloc_free_block(trans, root, root->leafsize, + 
path->nodes[1]->start, root->root_key.objectid, - &disk_key, 0, l->start, 0); + trans->transid, 0, l->start, 0); if (IS_ERR(right)) { BUG_ON(1); return PTR_ERR(right); @@ -2945,7 +2963,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, @@ -2956,47 +2973,79 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (split == 0) { - if (mid <= slot) { - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + struct btrfs_disk_key disk_key; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - } else { - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); - if (wret) - ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); if (wret) ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } } } - btrfs_mark_buffer_dirty(right); - return ret; } ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); BUG_ON(ret); - if (split == 2) { + if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; @@ -3398,7 +3447,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, /* figure out how many keys we can insert in here */ total_data = data_size[0]; for (i = 1; i < nr; i++) { - if 
(btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) break; total_data += data_size[i]; } @@ -3696,7 +3745,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. + * path->nodes[1]. bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. * * This deletes the pointer in path->nodes[1] and frees the leaf * block extent. zero is returned if it all worked out, < 0 otherwise. @@ -3704,14 +3755,15 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) { int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + u64 parent_start = path->nodes[1]->start; + u64 parent_owner = btrfs_header_owner(path->nodes[1]); - WARN_ON(btrfs_header_generation(leaf) != trans->transid); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; @@ -3722,8 +3774,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0, 0); + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + parent_start, parent_owner, + root_gen, 0, 1); return ret; } /* @@ -3791,7 +3845,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf->start); BUG_ON(ret); } } else { @@ -3807,7 +3861,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && + !trans->transaction->delayed_refs.flushing) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3829,7 +3884,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, + leaf->start); BUG_ON(ret); free_extent_buffer(leaf); } else { diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 03441a99ea38..4414a5d9983a 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -45,8 +45,6 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 -#define BTRFS_COMPAT_EXTENT_TREE_V0 - /* * files bigger than this get some pre-flushing when they are added * to the ordered operations list. 
 * That way we limit the total
@@ -269,18 +267,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }

 #define BTRFS_FSID_SIZE 16
-#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
-#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
-#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
-#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
-
-#define BTRFS_BACKREF_REV_MAX		256
-#define BTRFS_BACKREF_REV_SHIFT		56
-#define BTRFS_BACKREF_REV_MASK	(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
-				 BTRFS_BACKREF_REV_SHIFT)
-
-#define BTRFS_OLD_BACKREF_REV		0
-#define BTRFS_MIXED_BACKREF_REV		1
+#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)

 /*
  * every tree block (leaf or node) starts with this header.
@@ -309,6 +296,7 @@ struct btrfs_header {
 				sizeof(struct btrfs_item) - \
 				sizeof(struct btrfs_file_extent_item))

+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)

 /*
  * this is a very generous portion of the super block, giving us
@@ -367,12 +355,9 @@ struct btrfs_super_block {
  * Compat flags that we support. If any incompat flags are set other than the
  * ones specified below then we will fail to mount
  */
-#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
-
-#define BTRFS_FEATURE_COMPAT_SUPP	0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP	\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+#define BTRFS_FEATURE_COMPAT_SUPP	0x0
+#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
+#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0

 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -436,65 +421,23 @@ struct btrfs_path {
 	unsigned int keep_locks:1;
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
-	unsigned int search_commit_root:1;
 };

 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
-
 struct btrfs_extent_item {
-	__le64 refs;
-	__le64 generation;
-	__le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_item_v0 {
 	__le32 refs;
 } __attribute__ ((__packed__));

-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
-					sizeof(struct btrfs_item))
-
-#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
-#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)
-
-/* following flags only apply to tree blocks */
-
-/* use full backrefs for extent pointers in the block */
-#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
-
-struct btrfs_tree_block_info {
-	struct btrfs_disk_key key;
-	u8 level;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_data_ref {
-	__le64 root;
-	__le64 objectid;
-	__le64 offset;
-	__le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_shared_data_ref {
-	__le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_inline_ref {
-	u8 type;
-	u64 offset;
-} __attribute__ ((__packed__));
-
-/* old style backrefs item */
-struct btrfs_extent_ref_v0 {
+struct btrfs_extent_ref {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le32 count;
+	__le32 num_refs;
 } __attribute__ ((__packed__));

-
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
  * the extent.  The chunk tree uuid field is a way to double check the owner
@@ -752,7 +695,12 @@ struct btrfs_block_group_cache {
 	struct list_head cluster_list;
 };

-struct reloc_control;
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct list_head list;
+	spinlock_t lock;
+};
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
@@ -883,11 +831,18 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;

+	/* tree relocation relocated fields */
+	struct list_head dead_reloc_roots;
+	struct btrfs_leaf_ref_tree reloc_ref_tree;
+	struct btrfs_leaf_ref_tree shared_ref_tree;
+
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	atomic_t throttles;
+	atomic_t throttle_gen;

 	u64 total_pinned;

@@ -906,8 +861,6 @@ struct btrfs_fs_info {
 	 */
 	struct list_head space_info;

-	struct reloc_control *reloc_ctl;
-
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -938,6 +891,7 @@ struct btrfs_fs_info {
 * in ram representation of the tree.  extent_root is used for all allocations
 * and for the extent tree extent_root root.
 */
+struct btrfs_dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;

@@ -945,6 +899,9 @@ struct btrfs_root {
 	spinlock_t node_lock;

 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+	struct btrfs_leaf_ref_tree ref_tree_struct;
+	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;

@@ -995,15 +952,10 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;

-	struct list_head root_list;
-
 	spinlock_t list_lock;
+	struct list_head dead_list;
 	struct list_head orphan_list;

-	spinlock_t inode_lock;
-	/* red-black tree that keeps track of in-memory inodes */
-	struct rb_root inode_tree;
-
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
@@ -1065,16 +1017,7 @@ struct btrfs_root {
 * are used, and how many references there are to each block
 */
 #define BTRFS_EXTENT_ITEM_KEY	168
-
-#define BTRFS_TREE_BLOCK_REF_KEY	176
-
-#define BTRFS_EXTENT_DATA_REF_KEY	178
-
-#define BTRFS_EXTENT_REF_V0_KEY	180
-
-#define BTRFS_SHARED_BLOCK_REF_KEY	182
-
-#define BTRFS_SHARED_DATA_REF_KEY	184
+#define BTRFS_EXTENT_REF_KEY	180

 /*
 * block groups give us hints into the extent allocation trees.  Which
@@ -1100,8 +1043,6 @@ struct btrfs_root {
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
 #define BTRFS_MOUNT_NOTREELOG		(1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
-#define BTRFS_MOUNT_NOSSD		(1 << 9)

 #define btrfs_clear_opt(o, opt)	((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)	((o) |= BTRFS_MOUNT_##opt)
@@ -1115,14 +1056,12 @@ struct btrfs_root {
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define BTRFS_INODE_PREALLOC		(1 << 4)
-#define BTRFS_INODE_SYNC		(1 << 5)
-#define BTRFS_INODE_IMMUTABLE		(1 << 6)
-#define BTRFS_INODE_APPEND		(1 << 7)
-#define BTRFS_INODE_NODUMP		(1 << 8)
-#define BTRFS_INODE_NOATIME		(1 << 9)
-#define BTRFS_INODE_DIRSYNC		(1 << 10)
-
-
+#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
+					 ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
+					 BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
+					 BTRFS_INODE_##flag)

 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1378,67 +1317,24 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 	return (u8 *)((unsigned long)dev + ptr);
 }

-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
-BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
-		   generation, 64);
-BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-
-BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
-
-
-BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-
-static inline void btrfs_tree_block_key(struct extent_buffer *eb,
-					struct btrfs_tree_block_info *item,
-					struct btrfs_disk_key *key)
-{
-	read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
-					    struct btrfs_tree_block_info *item,
-					    struct btrfs_disk_key *key)
-{
-	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
-		   root, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
-		   objectid, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
-		   offset, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
-		   count, 32);
-
-BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
-		   count, 32);
-
-BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
-		   type, 8);
-BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
-		   offset, 64);
+/* struct btrfs_extent_ref */
+BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);

-static inline u32 btrfs_extent_inline_ref_size(int type)
-{
-	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    type == BTRFS_SHARED_BLOCK_REF_KEY)
-		return sizeof(struct btrfs_extent_inline_ref);
-	if (type == BTRFS_SHARED_DATA_REF_KEY)
-		return sizeof(struct btrfs_shared_data_ref) +
-		       sizeof(struct btrfs_extent_inline_ref);
-	if (type == BTRFS_EXTENT_DATA_REF_KEY)
-		return sizeof(struct btrfs_extent_data_ref) +
-		       offsetof(struct btrfs_extent_inline_ref, offset);
-	BUG();
-	return 0;
-}
+BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
+			 objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+			 num_refs, 32);

-BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
-		   generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
+/* struct btrfs_extent_item */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
+			 refs, 32);

 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
@@ -1662,21 +1558,6 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 	return (flags & flag) == flag;
 }

-static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
-{
-	u64 flags = btrfs_header_flags(eb);
-	return flags >> BTRFS_BACKREF_REV_SHIFT;
-}
-
-static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
-						int rev)
-{
-	u64 flags = btrfs_header_flags(eb);
-	flags &= ~BTRFS_BACKREF_REV_MASK;
-	flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
-	btrfs_set_header_flags(eb, flags);
-}
-
 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1909,32 +1790,39 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 objectid, u64 offset, u64 bytenr);
+			  struct btrfs_root *root, u64 objectid, u64 bytenr);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root, u32 blocksize,
-					u64 parent, u64 root_objectid,
-					struct btrfs_disk_key *key, int level,
-					u64 hint, u64 empty_size);
+					     struct btrfs_root *root,
+					     u32 blocksize, u64 parent,
+					     u64 root_objectid,
+					     u64 ref_generation,
+					     int level,
+					     u64 hint,
+					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
 					    int level);
-int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root,
-				     u64 root_objectid, u64 owner,
-				     u64 offset, struct btrfs_key *ins);
-int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   u64 root_objectid, u64 owner, u64 offset,
-				   struct btrfs_key *ins);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 parent, u64 min_bytes,
+		       u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 empty_size, u64 hint_byte,
+		       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, u64 parent,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 parent,
+			      u64 root_objectid, u64 ref_generation,
+			      u64 owner, struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
@@ -1942,18 +1830,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 u64 search_end, struct btrfs_key *ins,
 			 u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
-int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
-int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset);
-
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1961,8 +1849,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset);
-
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner_objectid);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+			    u64 orig_parent, u64 parent,
+			    u64 root_objectid, u64 ref_generation,
+			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1974,9 +1867,16 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1991,12 +1891,13 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes);
 /* ctree.c */
-int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
-		     int level, int *slot);
-int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
@@ -2017,8 +1918,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root,
 		    struct extent_buffer *buf,
 		    struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_root *root,
-			      struct extent_buffer *buf);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -2045,6 +1944,9 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct btrfs_path *path, int slot, int nr);
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root,
+		   struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
@@ -2103,9 +2005,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
-int btrfs_set_root_node(struct btrfs_root_item *item,
-			struct extent_buffer *node);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+			  struct btrfs_root *latest_root);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
@@ -2238,6 +2139,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
+void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2245,8 +2147,12 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    struct btrfs_root *root, int wait);
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *is_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2262,8 +2168,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);

 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-void btrfs_update_iflags(struct inode *inode);
-void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);

 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
@@ -2301,20 +2205,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);

 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
-#else
-#define btrfs_check_acl NULL
-#endif
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);

-/* relocation.c */
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root);
-int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 #endif
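The ctree.h hunks above collapse the three-field extent item back to a single
little-endian refcount and restore the original btrfs_extent_ref item. As a
quick sanity check on the restored on-disk layout, here is a minimal
standalone sketch; the typedefs are stand-ins for the kernel's bitwise
__le types, so read it as an illustration rather than as kernel code:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t __le32;	/* stand-ins for the kernel bitwise types */
typedef uint64_t __le64;

struct btrfs_extent_ref {
	__le64 root;		/* objectid of the referencing tree root */
	__le64 generation;	/* transid when the ref was created */
	__le64 objectid;	/* inode objectid, or level for tree blocks */
	__le32 num_refs;	/* references held through this parent */
} __attribute__ ((__packed__));

int main(void)
{
	/* packed => no tail padding: 8 + 8 + 8 + 4 bytes */
	assert(sizeof(struct btrfs_extent_ref) == 28);
	printf("extent ref item is %zu bytes on disk\n",
	       sizeof(struct btrfs_extent_ref));
	return 0;
}

The packed attribute matters here: without it the trailing __le32 would be
padded out to an 8-byte boundary and the item would no longer match what
older kernels wrote to disk.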
diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c
index 84e6781413b1..d6c01c096a40 100644
--- a/trunk/fs/btrfs/delayed-ref.c
+++ b/trunk/fs/btrfs/delayed-ref.c
@@ -29,87 +29,27 @@
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
 */

 /*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
-			  struct btrfs_delayed_tree_ref *ref1)
-{
-	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * compare two delayed data backrefs with same bytenr and type
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
 */
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
-			  struct btrfs_delayed_data_ref *ref1)
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+		      u64 bytenr, u64 parent)
 {
-	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-		if (ref1->objectid < ref2->objectid)
-			return -1;
-		if (ref1->objectid > ref2->objectid)
-			return 1;
-		if (ref1->offset < ref2->offset)
-			return -1;
-		if (ref1->offset > ref2->offset)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1)
-{
-	if (ref1->bytenr < ref2->bytenr)
+	if (bytenr < ref->bytenr)
 		return -1;
-	if (ref1->bytenr > ref2->bytenr)
+	if (bytenr > ref->bytenr)
 		return 1;
-	if (ref1->is_head && ref2->is_head)
-		return 0;
-	if (ref2->is_head)
+	if (parent < ref->parent)
 		return -1;
-	if (ref1->is_head)
+	if (parent > ref->parent)
 		return 1;
-	if (ref1->type < ref2->type)
-		return -1;
-	if (ref1->type > ref2->type)
-		return 1;
-	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1));
-	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
-		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
-		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
-				      btrfs_delayed_node_to_data_ref(ref1));
-	}
-	BUG();
	return 0;
 }
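After this hunk comp_entry() orders the rb tree by (bytenr, parent) alone.
A small userspace sketch of the same two-level comparison follows; the
struct and names are invented for the example, and the head entry is
probed with parent == (u64)-1 as later hunks in this file do:

#include <stdint.h>
#include <stdio.h>

/* toy stand-in for the delayed ref node; only the sort keys matter */
struct node {
	uint64_t bytenr;
	uint64_t parent;
};

/* same ordering rule as comp_entry: bytenr first, then parent */
static int comp(const struct node *ref, uint64_t bytenr, uint64_t parent)
{
	if (bytenr < ref->bytenr)
		return -1;
	if (bytenr > ref->bytenr)
		return 1;
	if (parent < ref->parent)
		return -1;
	if (parent > ref->parent)
		return 1;
	return 0;
}

int main(void)
{
	struct node head = { .bytenr = 4096, .parent = UINT64_MAX };
	/* a real backref (small parent) sorts before the head entry,
	 * which carries the largest possible parent for its bytenr */
	printf("%d\n", comp(&head, 4096, 0));		/* prints -1 */
	printf("%d\n", comp(&head, 4096, UINT64_MAX));	/* prints 0 */
	return 0;
}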
@@ -119,21 +59,20 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 * inserted.
 */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+						  u64 bytenr, u64 parent,
 						  struct rb_node *node)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
-	struct btrfs_delayed_ref_node *ins;
 	int cmp;

-	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	while (*p) {
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);

-		cmp = comp_entry(entry, ins);
+		cmp = comp_entry(entry, bytenr, parent);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -142,17 +81,18 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 			return entry;
 	}

+	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	rb_link_node(node, parent_node, p);
 	rb_insert_color(node, root);
 	return NULL;
 }

 /*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
 */
-static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
-				  u64 bytenr,
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+				  u64 bytenr, u64 parent,
 				  struct btrfs_delayed_ref_node **last)
 {
 	struct rb_node *n = root->rb_node;
@@ -165,15 +105,7 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
 		if (last)
 			*last = entry;

-		if (bytenr < entry->bytenr)
-			cmp = -1;
-		else if (bytenr > entry->bytenr)
-			cmp = 1;
-		else if (!btrfs_delayed_ref_is_head(entry))
-			cmp = 1;
-		else
-			cmp = 0;
-
+		cmp = comp_entry(entry, bytenr, parent);
 		if (cmp < 0)
 			n = n->rb_left;
 		else if (cmp > 0)
@@ -222,7 +154,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		find_ref_head(&delayed_refs->root, start, &ref);
+		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
 		if (ref) {
 			struct btrfs_delayed_ref_node *tmp;

@@ -302,7 +234,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);

-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		prev_node = rb_prev(&ref->rb_node);
 		if (!prev_node)
@@ -318,28 +250,25 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 }

 /*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count
 *
 * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
+ * reference count modifications queued up in the rbtree. This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
 */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
 {
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
 	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
-	u32 item_size;
-	u64 num_refs;
-	u64 extent_flags;
+	u32 num_refs;
 	int ret;

 	path = btrfs_alloc_path();
@@ -358,60 +287,37 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,

 	if (ret == 0) {
 		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if (item_size >= sizeof(*ei)) {
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_extent_item);
-			num_refs = btrfs_extent_refs(leaf, ei);
-			extent_flags = btrfs_extent_flags(leaf, ei);
-		} else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-			struct btrfs_extent_item_v0 *ei0;
-			BUG_ON(item_size != sizeof(*ei0));
-			ei0 = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_extent_item_v0);
-			num_refs = btrfs_extent_refs_v0(leaf, ei0);
-			/* FIXME: this isn't correct for data */
-			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-			BUG();
-#endif
-		}
-		BUG_ON(num_refs == 0);
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_extent_item);
+		num_refs = btrfs_extent_refs(leaf, ei);
 	} else {
 		num_refs = 0;
-		extent_flags = 0;
 		ret = 0;
 	}

 	spin_lock(&delayed_refs->lock);
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		head = btrfs_delayed_node_to_head(ref);
-		if (!mutex_trylock(&head->mutex)) {
-			atomic_inc(&ref->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			btrfs_release_path(root->fs_info->extent_root, path);
-
-			mutex_lock(&head->mutex);
+		if (mutex_trylock(&head->mutex)) {
+			num_refs += ref->ref_mod;
 			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(ref);
-			goto again;
+			*refs = num_refs;
+			goto out;
 		}
-		if (head->extent_op && head->extent_op->update_flags)
-			extent_flags |= head->extent_op->flags_to_set;
-		else
-			BUG_ON(num_refs == 0);

-		num_refs += ref->ref_mod;
+		atomic_inc(&ref->refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
 		mutex_unlock(&head->mutex);
-	}
-	WARN_ON(num_refs == 0);
-	if (refs)
+		btrfs_put_delayed_ref(ref);
+		goto again;
+	} else {
 		*refs = num_refs;
-	if (flags)
-		*flags = extent_flags;
+	}
 out:
 	spin_unlock(&delayed_refs->lock);
 	btrfs_free_path(path);
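The restored btrfs_lookup_extent_ref() reports the on-disk count plus
whatever modifications are still queued on the head node. A toy sketch of
that arithmetic, with a plain array standing in for the queued ref_mod
values (everything here is invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* effective count = on-disk count + every pending modification */
static uint32_t effective_refs(uint32_t on_disk, const int *ref_mods, int n)
{
	uint32_t refs = on_disk;
	for (int i = 0; i < n; i++)
		refs += ref_mods[i];	/* ref_mod may be negative (drops) */
	return refs;
}

int main(void)
{
	int mods[] = { +1, +1, -1 };	/* two adds and a drop still queued */
	printf("%u\n", effective_refs(2, mods, 3));	/* prints 3 */
	return 0;
}

In the kernel code the head node already caches that sum as a single
ref_mod, which is why the lookup only has to add one value under the mutex.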
@@ -432,7 +338,16 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_delayed_ref_node *existing,
 			 struct btrfs_delayed_ref_node *update)
 {
-	if (update->action != existing->action) {
+	struct btrfs_delayed_ref *existing_ref;
+	struct btrfs_delayed_ref *ref;
+
+	existing_ref = btrfs_delayed_node_to_ref(existing);
+	ref = btrfs_delayed_node_to_ref(update);
+
+	if (ref->pin)
+		existing_ref->pin = 1;
+
+	if (ref->action != existing_ref->action) {
 		/*
 		 * this is effectively undoing either an add or a
 		 * drop.  We decrement the ref_mod, and if it goes
@@ -448,13 +363,20 @@ update_existing_ref,
 			delayed_refs->num_entries--;
 			if (trans->delayed_ref_updates)
 				trans->delayed_ref_updates--;
-		} else {
-			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 	} else {
-		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+			/* if we're adding refs, make sure all the
+			 * details match up.  The extent could
+			 * have been totally freed and reallocated
+			 * by a different owner before the delayed
+			 * ref entries were removed.
+			 */
+			existing_ref->owner_objectid = ref->owner_objectid;
+			existing_ref->generation = ref->generation;
+			existing_ref->root = ref->root;
+			existing->num_bytes = update->num_bytes;
+		}
 		/*
 		 * the action on the existing ref matches
 		 * the action on the ref we're trying to add.
@@ -479,7 +401,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);

-	BUG_ON(existing_ref->is_data != ref->is_data);
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
@@ -499,24 +420,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	}

-	if (ref->extent_op) {
-		if (!existing_ref->extent_op) {
-			existing_ref->extent_op = ref->extent_op;
-		} else {
-			if (ref->extent_op->update_key) {
-				memcpy(&existing_ref->extent_op->key,
-				       &ref->extent_op->key,
-				       sizeof(ref->extent_op->key));
-				existing_ref->extent_op->update_key = 1;
-			}
-			if (ref->extent_op->update_flags) {
-				existing_ref->extent_op->flags_to_set |=
-					ref->extent_op->flags_to_set;
-				existing_ref->extent_op->update_flags = 1;
-			}
-			kfree(ref->extent_op);
-		}
-	}
 	/*
 	 * update the reference mod on the head to reflect this new operation
 	 */
@@ -524,16 +427,19 @@
 }

 /*
- * helper function to actually insert a head node into the rbtree.
+ * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
 */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes,
-					 int action, int is_data)
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_delayed_ref_node *ref,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
 {
 	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_ref *full_ref;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
@@ -543,10 +449,12 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
-	if (action == BTRFS_UPDATE_DELAYED_HEAD)
-		count_mod = 0;
-	else if (action == BTRFS_DROP_DELAYED_REF)
-		count_mod = -1;
+	if (parent == (u64)-1) {
+		if (action == BTRFS_DROP_DELAYED_REF)
+			count_mod = -1;
+		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+			count_mod = 0;
+	}

 	/*
 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -559,148 +467,57 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	 * Once we record must_insert_reserved, switch the action to
 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
 	 */
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
+	if (action == BTRFS_ADD_DELAYED_EXTENT) {
 		must_insert_reserved = 1;
-	else
-		must_insert_reserved = 0;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	/* first set the basic ref node struct up */
-	atomic_set(&ref->refs, 1);
-	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = count_mod;
-	ref->type = 0;
-	ref->action = 0;
-	ref->is_head = 1;
-	ref->in_tree = 1;
-
-	head_ref = btrfs_delayed_node_to_head(ref);
-	head_ref->must_insert_reserved = must_insert_reserved;
-	head_ref->is_data = is_data;
-
-	INIT_LIST_HEAD(&head_ref->cluster);
-	mutex_init(&head_ref->mutex);
-
-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
-	if (existing) {
-		update_existing_head_ref(existing, ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
-		kfree(ref);
-	} else {
-		delayed_refs->num_heads++;
-		delayed_refs->num_heads_ready++;
-		delayed_refs->num_entries++;
-		trans->delayed_ref_updates++;
-	}
-	return 0;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, int level, int action)
-{
-	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_tree_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	/* first set the basic ref node struct up */
-	atomic_set(&ref->refs, 1);
-	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = 1;
-	ref->action = action;
-	ref->is_head = 0;
-	ref->in_tree = 1;
-
-	full_ref = btrfs_delayed_node_to_tree_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
-		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
	} else {
-		full_ref->root = ref_root;
-		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-	}
-	full_ref->level = level;
-
-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, existing, ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
-		kfree(ref);
-	} else {
-		delayed_refs->num_entries++;
-		trans->delayed_ref_updates++;
+		must_insert_reserved = 0;
 	}
-	return 0;
-}
-
-/*
- * helper to insert a delayed data ref into the rbtree.
- */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_delayed_ref_node *ref,
-					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, u64 owner, u64 offset,
-					 int action)
-{
-	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_data_ref *full_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-
-	if (action == BTRFS_ADD_DELAYED_EXTENT)
-		action = BTRFS_ADD_DELAYED_REF;

 	delayed_refs = &trans->transaction->delayed_refs;

 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
-	ref->num_bytes = num_bytes;
-	ref->ref_mod = 1;
-	ref->action = action;
-	ref->is_head = 0;
+	ref->parent = parent;
+	ref->ref_mod = count_mod;
 	ref->in_tree = 1;
+	ref->num_bytes = num_bytes;

-	full_ref = btrfs_delayed_node_to_data_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
-		ref->type = BTRFS_SHARED_DATA_REF_KEY;
+	if (btrfs_delayed_ref_is_head(ref)) {
+		head_ref = btrfs_delayed_node_to_head(ref);
+		head_ref->must_insert_reserved = must_insert_reserved;
+		INIT_LIST_HEAD(&head_ref->cluster);
+		mutex_init(&head_ref->mutex);
 	} else {
+		full_ref = btrfs_delayed_node_to_ref(ref);
 		full_ref->root = ref_root;
-		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+		full_ref->generation = ref_generation;
+		full_ref->owner_objectid = owner_objectid;
+		full_ref->pin = pin;
+		full_ref->action = action;
 	}
-	full_ref->objectid = owner;
-	full_ref->offset = offset;

-	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+	existing = tree_insert(&delayed_refs->root, bytenr,
+			       parent, &ref->rb_node);

 	if (existing) {
-		update_existing_ref(trans, delayed_refs, existing, ref);
+		if (btrfs_delayed_ref_is_head(ref))
+			update_existing_head_ref(existing, ref);
+		else
+			update_existing_ref(trans, delayed_refs, existing, ref);
+
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
 		kfree(ref);
 	} else {
+		if (btrfs_delayed_ref_is_head(ref)) {
+			delayed_refs->num_heads++;
+			delayed_refs->num_heads_ready++;
+		}
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
@@ -708,78 +525,37 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
 }

 /*
- * add a delayed tree ref.  This does all of the accounting required
+ * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
 {
-	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_delayed_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;

-	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmalloc(sizeof(*ref), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;

-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
-	if (!head_ref) {
-		kfree(ref);
-		return -ENOMEM;
-	}
-
-	head_ref->extent_op = extent_op;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
-
 	/*
-	 * insert both the head node and the new ref without dropping
-	 * the spin lock
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 0);
-	BUG_ON(ret);
-
-	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, level, action);
-	BUG_ON(ret);
-	spin_unlock(&delayed_refs->lock);
-	return 0;
-}
-
-/*
- * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
- */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_delayed_data_ref *ref;
-	struct btrfs_delayed_ref_head *head_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
-
-	BUG_ON(extent_op && !extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
+	if (parent == 0)
+		parent = bytenr;

 	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
 	if (!head_ref) {
 		kfree(ref);
 		return -ENOMEM;
 	}
-
-	head_ref->extent_op = extent_op;
-
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);

@@ -787,39 +563,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 1);
-	BUG_ON(ret);
-
-	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, owner, offset, action);
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0, action, pin);
 	BUG_ON(ret);
-	spin_unlock(&delayed_refs->lock);
-	return 0;
-}
-
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
-				struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_delayed_ref_head *head_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
-
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
-	if (!head_ref)
-		return -ENOMEM;
-
-	head_ref->extent_op = extent_op;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
-
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
-				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
-				   extent_op->is_data);
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, action, pin);
 	BUG_ON(ret);
-
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -836,7 +587,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;

 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
@@ -852,7 +603,6 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
-#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -916,4 +666,3 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
-#endif
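With the #if 0 gone, delayed-ref.c again keeps head nodes and per-parent
backrefs in one rb tree, telling them apart only by the parent sentinel.
A compact illustration of that convention follows; the struct and values
are invented for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ref_node {
	uint64_t bytenr;
	uint64_t parent;	/* (u64)-1 marks the head entry */
	int ref_mod;
};

static bool is_head(const struct ref_node *n)
{
	return n->parent == UINT64_MAX;
}

int main(void)
{
	/* one extent with two real refs plus its head, in tree order:
	 * real refs first, head last for a given bytenr */
	struct ref_node nodes[] = {
		{ 8192, 0,          +1 },
		{ 8192, 4096,       +1 },
		{ 8192, UINT64_MAX, +2 },	/* head: sum of the mods */
	};
	for (int i = 0; i < 3; i++)
		printf("parent=%llu head=%d\n",
		       (unsigned long long)nodes[i].parent,
		       is_head(&nodes[i]));
	return 0;
}

This is also why btrfs_add_delayed_ref() inserts two nodes under one
spin_lock: the head (parent == (u64)-1) and the real ref land adjacent in
the same tree, so processing can lock the head and walk forward.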
diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h
index f6fc67ddad36..3bec2ff0b15c 100644
--- a/trunk/fs/btrfs/delayed-ref.h
+++ b/trunk/fs/btrfs/delayed-ref.h
@@ -30,6 +30,9 @@ struct btrfs_delayed_ref_node {
 	/* the starting bytenr of the extent */
 	u64 bytenr;

+	/* the parent our backref will point to */
+	u64 parent;
+
 	/* the size of the extent */
 	u64 num_bytes;

@@ -47,21 +50,10 @@ struct btrfs_delayed_ref_node {
 	 */
 	int ref_mod;

-	unsigned int action:8;
-	unsigned int type:8;
 	/* is this node still in the rbtree? */
-	unsigned int is_head:1;
 	unsigned int in_tree:1;
 };

-struct btrfs_delayed_extent_op {
-	struct btrfs_disk_key key;
-	u64 flags_to_set;
-	unsigned int update_key:1;
-	unsigned int update_flags:1;
-	unsigned int is_data:1;
-};
-
 /*
 * the head refs are used to hold a lock on a given extent, which allows us
 * to make sure that only one process is running the delayed refs
@@ -79,7 +71,6 @@ struct btrfs_delayed_ref_head {

 	struct list_head cluster;

-	struct btrfs_delayed_extent_op *extent_op;
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -93,26 +84,27 @@ struct btrfs_delayed_ref_head {
 	 * the free has happened.
 	 */
 	unsigned int must_insert_reserved:1;
-	unsigned int is_data:1;
 };

-struct btrfs_delayed_tree_ref {
+struct btrfs_delayed_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
-	int level;
-};

-struct btrfs_delayed_data_ref {
-	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
-	u64 objectid;
-	u64 offset;
+	/* the root objectid our ref will point to */
+	u64 root;
+
+	/* the generation for the backref */
+	u64 generation;
+
+	/* owner_objectid of the backref */
+	u64 owner_objectid;
+
+	/* operation done by this entry in the rbtree */
+	u8 action;
+
+	/* if pin == 1, when the extent is freed it will be pinned until
+	 * transaction commit
+	 */
+	unsigned int pin:1;
 };

 struct btrfs_delayed_ref_root {
@@ -151,25 +143,17 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 	}
 }

-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
-				struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin);
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -185,24 +169,18 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-	return node->is_head;
+	return node->parent == (u64)-1;
 }

 /*
 * helper functions to cast a node into its container
 */
-static inline struct btrfs_delayed_tree_ref *
-btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_tree_ref, node);
-}
+	return container_of(node, struct btrfs_delayed_ref, node);

-static inline struct btrfs_delayed_data_ref *
-btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
-{
-	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_data_ref, node);
 }

 static inline struct btrfs_delayed_ref_head *
@@ -210,5 +188,6 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(!btrfs_delayed_ref_is_head(node));
 	return container_of(node, struct btrfs_delayed_ref_head, node);
+
 }
 #endif
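The node-to-container casts above all rely on the usual container_of()
pattern: given a pointer to an embedded member, subtract the member's
offset to recover the enclosing struct. A self-contained demonstration
with stand-in types, in case the pointer arithmetic is unfamiliar:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { int key; };

struct delayed_ref {
	long generation;
	struct node node;	/* embedded base, as in the header above */
};

int main(void)
{
	struct delayed_ref ref = { .generation = 42 };
	struct node *n = &ref.node;
	/* recover the containing object from the embedded member */
	struct delayed_ref *back = container_of(n, struct delayed_ref, node);
	printf("%ld\n", back->generation);	/* prints 42 */
	return 0;
}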
diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c
index 0d50d49d990a..4b0ea0b80c23 100644
--- a/trunk/fs/btrfs/disk-io.c
+++ b/trunk/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include
 #include
 #include
-#include
 #include "compat.h"
+#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -36,6 +37,7 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
+#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"

@@ -171,7 +172,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-	return crc32c(seed, data, len);
+	return btrfs_crc32c(seed, data, len);
 }

 void btrfs_csum_final(u32 crc, char *result)
@@ -883,6 +884,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
 	root->node = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -897,14 +899,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
-	root->inode_tree.rb_node = NULL;

 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->dead_list);
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
-	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -918,6 +918,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);

+	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+	root->ref_tree = &root->ref_tree_struct;
+
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -956,7 +959,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -1023,19 +1025,20 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	root->ref_cows = 0;

-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, BTRFS_TREE_LOG_OBJECTID,
+				      trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
 	}

-	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
-	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
 	root->node = leaf;
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_bytenr(root->node, root->node->start);
+	btrfs_set_header_generation(root->node, trans->transid);
+	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);

 	write_extent_buffer(root->node, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(root->node),
@@ -1078,7 +1081,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);

-	btrfs_set_root_node(&log_root->root_item, log_root->node);
+	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
+	btrfs_set_root_generation(&log_root->root_item, trans->transid);

 	WARN_ON(root->log_root);
 	root->log_root = log_root;
@@ -1140,7 +1144,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1207,7 +1210,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	}
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_find_dead_roots(fs_info->tree_root,
-					    root->root_key.objectid);
+					    root->root_key.objectid, root);
 		BUG_ON(ret);
 		btrfs_orphan_cleanup(root);
 	}
@@ -1566,6 +1569,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
+	atomic_set(&fs_info->throttles, 0);
+	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1593,7 +1598,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;

-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
@@ -1609,6 +1613,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;

+	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
+	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
+
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1666,12 +1674,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_iput;
 	}

-	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
-
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
 	if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1769,7 +1771,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read the system "
 		       "array on %s\n", sb->s_id);
-		goto fail_sb_buffer;
+		goto fail_sys_array;
 	}

 	blocksize = btrfs_level_size(tree_root,
@@ -1783,8 +1785,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
-	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
-	chunk_root->commit_root = btrfs_root_node(chunk_root);

 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
 	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,8 +1810,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
-	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
-	tree_root->commit_root = btrfs_root_node(tree_root);
+

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1821,14 +1820,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
+	dev_root->track_dirty = 1;
 	if (ret)
 		goto fail_extent_root;
-	dev_root->track_dirty = 1;

 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto fail_extent_root;

 	csum_root->track_dirty = 1;

@@ -1850,14 +1849,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;

-	if (!btrfs_test_opt(tree_root, SSD) &&
-	    !btrfs_test_opt(tree_root, NOSSD) &&
-	    !fs_info->fs_devices->rotating) {
-		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
-		       "mode\n");
-		btrfs_set_opt(fs_info->mount_opt, SSD);
-	}
-
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);

@@ -1890,7 +1881,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}

 	if (!(sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_recover_relocation(tree_root);
+		ret = btrfs_cleanup_reloc_trees(tree_root);
 		BUG_ON(ret);
 	}

@@ -1901,7 +1892,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
-
 	return tree_root;

fail_trans_kthread:
@@ -1918,19 +1908,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,

fail_csum_root:
 	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
fail_extent_root:
 	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
fail_tree_root:
 	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+fail_sys_array:
+	free_extent_buffer(dev_root->node);
fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2020,17 +2005,6 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	return latest;
 }

-/*
- * this should be called twice, once with wait == 0 and
- * once with wait == 1. When wait == 0 is done, all the buffer heads
- * we write are pinned.
- *
- * They are released when wait == 1 is done.
- * max_mirrors must be the same for both runs, and it indicates how
- * many supers on this one device should be written.
- *
- * max_mirrors == 0 means to write them all.
- */
 static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb,
			    int do_barriers, int wait, int max_mirrors)
@@ -2066,16 +2040,12 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
 			BUG_ON(!bh);
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				errors++;
-
-			/* drop our reference */
 			brelse(bh);
-
-			/* drop the reference from the wait == 0 run */
-			brelse(bh);
-			continue;
+			wait_on_buffer(bh);
+			if (buffer_uptodate(bh)) {
+				brelse(bh);
+				continue;
+			}
 		} else {
 			btrfs_set_super_bytenr(sb, bytenr);

@@ -2086,18 +2056,12 @@ static int write_dev_supers(struct btrfs_device *device,
 					      BTRFS_CSUM_SIZE);
 			btrfs_csum_final(crc, sb->csum);

-			/*
-			 * one reference for us, and we leave it for the
-			 * caller
-			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);

-			/* one reference for submit_bh */
-			get_bh(bh);
-
 			set_buffer_uptodate(bh);
+			get_bh(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
@@ -2109,7 +2073,6 @@ static int write_dev_supers(struct btrfs_device *device,
 			       device->name);
 			set_buffer_uptodate(bh);
 			device->barriers = 0;
-			/* one reference for submit_bh */
 			get_bh(bh);
 			lock_buffer(bh);
 			ret = submit_bh(WRITE_SYNC, bh);
@@ -2118,15 +2081,22 @@ static int write_dev_supers(struct btrfs_device *device,
 			ret = submit_bh(WRITE_SYNC, bh);
 		}

-		if (ret)
+		if (!ret && wait) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+		} else if (ret) {
 			errors++;
+		}
+		if (wait)
+			brelse(bh);
 	}
 	return errors < i ? 0 : -1;
 }

 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *head;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
@@ -2141,9 +2111,6 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)

 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
-
-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	head = &root->fs_info->fs_devices->devices;
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
@@ -2187,7 +2154,6 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		if (ret)
 			total_errors++;
 	}
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
@@ -2207,7 +2173,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,

 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
-	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
 	if (root->anon_super.s_dev) {
@@ -2254,12 +2219,10 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
-
-		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
 			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid);
+						    root_objectid, gang[i]);
 			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
@@ -2315,16 +2278,20 @@ int close_ctree(struct btrfs_root *root)
 		       (unsigned long long)fs_info->total_ref_cache_size);
 	}

-	free_extent_buffer(fs_info->extent_root->node);
-	free_extent_buffer(fs_info->extent_root->commit_root);
-	free_extent_buffer(fs_info->tree_root->node);
-	free_extent_buffer(fs_info->tree_root->commit_root);
-	free_extent_buffer(root->fs_info->chunk_root->node);
-	free_extent_buffer(root->fs_info->chunk_root->commit_root);
-	free_extent_buffer(root->fs_info->dev_root->node);
-	free_extent_buffer(root->fs_info->dev_root->commit_root);
-	free_extent_buffer(root->fs_info->csum_root->node);
-	free_extent_buffer(root->fs_info->csum_root->commit_root);
+	if (fs_info->extent_root->node)
+		free_extent_buffer(fs_info->extent_root->node);
+
+	if (fs_info->tree_root->node)
+		free_extent_buffer(fs_info->tree_root->node);
+
+	if (root->fs_info->chunk_root->node)
+		free_extent_buffer(root->fs_info->chunk_root->node);
+
+	if (root->fs_info->dev_root->node)
+		free_extent_buffer(root->fs_info->dev_root->node);
+
+	if (root->fs_info->csum_root->node)
+		free_extent_buffer(root->fs_info->csum_root->node);

 	btrfs_free_block_groups(root->fs_info);

@@ -2406,14 +2373,17 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
+	struct extent_io_tree *tree;
 	u64 num_dirty;
+	u64 start = 0;
 	unsigned long thresh = 32 * 1024 * 1024;
+	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;

 	if (current->flags & PF_MEMALLOC)
 		return;

-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
+	num_dirty = count_range_bits(tree, &start, (u64)-1,
+				     thresh, EXTENT_DIRTY);
 	if (num_dirty > thresh) {
 		balance_dirty_pages_ratelimited_nr(
				   root->fs_info->btree_inode->i_mapping, 1);
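Note how write_dev_supers() ends: it tolerates partial failure, and only
reports an error once every super block copy on the device failed. A
one-function sketch of that success criterion (names invented here):

#include <stdio.h>

/* mirrors "return errors < i ? 0 : -1" at the end of write_dev_supers() */
static int supers_written_ok(int copies, int errors)
{
	return errors < copies ? 0 : -1;
}

int main(void)
{
	printf("%d\n", supers_written_ok(3, 2));	/* 0: one copy survived */
	printf("%d\n", supers_written_ok(3, 3));	/* -1: all copies failed */
	return 0;
}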
diff --git a/trunk/fs/btrfs/export.c b/trunk/fs/btrfs/export.c
index 9596b40caa4e..85315d2c90de 100644
--- a/trunk/fs/btrfs/export.c
+++ b/trunk/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;

-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode))
 		return (void *)inode;

@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;

-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 }

 const struct export_operations btrfs_export_ops = {
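Both export.c callers pass NULL for btrfs_iget()'s restored is_new
argument, since they don't care whether the inode was freshly created.
That optional out-parameter idiom, reduced to a toy function:

#include <stdbool.h>
#include <stdio.h>

/* callers that don't need the flag simply pass NULL, as export.c does */
static int lookup(int key, bool *is_new)
{
	bool created = (key % 2) != 0;	/* pretend odd keys were absent */
	if (is_new)
		*is_new = created;
	return key;
}

int main(void)
{
	bool is_new;
	lookup(7, &is_new);	/* this caller wants to know */
	printf("new=%d\n", is_new);
	lookup(8, NULL);	/* this one doesn't */
	return 0;
}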
The full back refs is for pointers in tree blocks not - * referenced by their owner trees. The location of tree block is recorded - * in the back refs. Actually the full back refs is generic, and can be - * used in all cases the implicit back refs is used. The major shortcoming - * of the full back refs is its overhead. Every time a tree block gets - * COWed, we have to update back refs entry for all pointers in it. - * - * For a newly allocated tree block, we use implicit back refs for - * pointers in it. This means most tree related operations only involve - * implicit back refs. For a tree block created in old transaction, the - * only way to drop a reference to it is COW it. So we can detect the - * event that tree block loses its owner tree's reference and do the - * back refs conversion. - * - * When a tree block is COW'd through a tree, there are four cases: - * - * The reference count of the block is one and the tree is the block's - * owner tree. Nothing to do in this case. - * - * The reference count of the block is one and the tree is not the - * block's owner tree. In this case, full back refs is used for pointers - * in the block. Remove these full back refs, add implicit back refs for - * every pointers in the new block. - * - * The reference count of the block is greater than one and the tree is - * the block's owner tree. In this case, implicit back refs is used for - * pointers in the block. Add full back refs for every pointers in the - * block, increase lower level extents' reference counts. The original - * implicit back refs are entailed to the new block. - * - * The reference count of the block is greater than one and the tree is - * not the block's owner tree. Add implicit back refs for every pointer in - * the new block, increase lower level extents' reference count. - * - * Back Reference Key composing: - * - * The key objectid corresponds to the first byte in the extent, - * The key type is used to differentiate between types of back refs. - * There are different meanings of the key offset for different types - * of back refs. - * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * - * The extent ref structure for the implicit back refs has fields for: + * The extent ref structure has fields for: * * - Objectid of the subvolume root + * - Generation number of the tree holding the reference * - objectid of the file holding the reference - * - original offset in the file - * - how many bookend extents - * - * The key offset for the implicit back refs is hash of the first - * three fields. - * - * The extent ref structure for the full back refs has field for: + * - number of references holding by parent node (alway 1 for tree blocks) * - * - number of pointers in the tree leaf + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. * - * The key offset for the implicit back refs is the first byte of - * the tree leaf + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) * - * When a file extent is allocated, The implicit back refs is used. - * the fields are filled in: + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. 
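
The restored comment above boils each back reference down to a four-field tuple. A hedged sketch of how those fields are filled for the two cases the comment walks through (fresh allocation versus a cow'd leaf), using illustrative names rather than the real on-disk struct btrfs_extent_ref:

#include <stdio.h>
#include <stdint.h>

struct backref_tuple {
	uint64_t root_objectid;  /* subvolume root holding the reference */
	uint64_t generation;     /* trans->transid when the ref was added */
	uint64_t owner_objectid; /* objectid of the file (inode) */
	uint64_t num_refs;       /* references held by the parent node */
};

/* When a file extent is allocated: one reference at the current transid. */
static struct backref_tuple ref_on_alloc(uint64_t root, uint64_t transid,
					 uint64_t ino)
{
	return (struct backref_tuple){ root, transid, ino, 1 };
}

/*
 * When a leaf is cow'd: same root and inode, but the newer cow transid,
 * and one reference for each time the extent appears in the leaf.
 */
static struct backref_tuple ref_on_leaf_cow(uint64_t root, uint64_t cow_transid,
					    uint64_t ino, uint64_t refs_in_leaf)
{
	return (struct backref_tuple){ root, cow_transid, ino, refs_in_leaf };
}

static void show(const char *what, struct backref_tuple t)
{
	printf("%s: (%llu, %llu, %llu, %llu)\n", what,
	       (unsigned long long)t.root_objectid,
	       (unsigned long long)t.generation,
	       (unsigned long long)t.owner_objectid,
	       (unsigned long long)t.num_refs);
}

int main(void)
{
	show("alloc   ", ref_on_alloc(5, 100, 257));
	show("leaf cow", ref_on_leaf_cow(5, 142, 257, 3));
	return 0;
}

Note that only the generation and the reference count differ between the two cases, which is why the removal path described next can match a reference using just the owner, generation, and inode objectid.
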
It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. * - * (root_key.objectid, inode objectid, offset in file, 1) + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) * - * When a file extent is removed file truncation, we find the - * corresponding implicit back refs and check the following fields: + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: * - * (btrfs_header_owner(leaf), inode objectid, offset in file) + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) * * Btree extents can be referenced by: * * - Different subvolumes + * - Different generations of the same subvolume * - * Both the implicit back refs and the full back refs for tree blocks - * only consist of key. The key offset for the implicit back refs is - * objectid of block's owner tree. The key offset for the full back refs - * is the first byte of parent block. + * When a tree block is created, back references are inserted: * - * When implicit back refs is used, information about the lowest key and - * level of the tree block are required. These information are stored in - * tree block info structure. + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. If a extent is tree root, the key offset is set + * to the key objectid. 
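
The key-composition rule at the end of the comment is mechanical, so a short sketch can show it directly. The constant and struct below are illustrative stand-ins; the on-disk key layout and the real BTRFS_EXTENT_REF_KEY value live in ctree.h:

#include <stdio.h>
#include <stdint.h>

#define EXTENT_REF_KEY 180	/* placeholder, not the on-disk constant */

struct ref_key {
	uint64_t objectid;	/* first byte of the extent */
	uint8_t  type;		/* BTRFS_EXTENT_REF_KEY in the real tree */
	uint64_t offset;	/* first byte of the parent extent */
};

static struct ref_key compose_ref_key(uint64_t extent_start,
				      uint64_t parent_start, int is_tree_root)
{
	struct ref_key k;

	k.objectid = extent_start;
	k.type = EXTENT_REF_KEY;
	/* a tree root has no parent: the offset points back at itself */
	k.offset = is_tree_root ? extent_start : parent_start;
	return k;
}

int main(void)
{
	struct ref_key child = compose_ref_key(8192, 4096, 0);
	struct ref_key root  = compose_ref_key(8192, 0, 1);

	printf("child ref: (%llu, %u, %llu)\n",
	       (unsigned long long)child.objectid, (unsigned)child.type,
	       (unsigned long long)child.offset);
	printf("root ref:  (%llu, %u, %llu)\n",
	       (unsigned long long)root.objectid, (unsigned)root.type,
	       (unsigned long long)root.offset);
	return 0;
}

Keying the reference by the parent's start byte is what lets lookup_extent_backref below find a specific back reference with a single btrfs_search_slot call instead of scanning every ref item for the extent.
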
*/ -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int convert_extent_item_v0(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 owner, u32 extra_size) -{ - struct btrfs_extent_item *item; - struct btrfs_extent_item_v0 *ei0; - struct btrfs_extent_ref_v0 *ref0; - struct btrfs_tree_block_info *bi; - struct extent_buffer *leaf; - struct btrfs_key key; - struct btrfs_key found_key; - u32 new_size = sizeof(*item); - u64 refs; - int ret; - - leaf = path->nodes[0]; - BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - refs = btrfs_extent_refs_v0(leaf, ei0); - - if (owner == (u64)-1) { - while (1) { - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - } - btrfs_item_key_to_cpu(leaf, &found_key, - path->slots[0]); - BUG_ON(key.objectid != found_key.objectid); - if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { - path->slots[0]++; - continue; - } - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - owner = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - } - btrfs_release_path(root, path); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) - new_size += sizeof(*bi); - - new_size -= sizeof(*ei0); - ret = btrfs_search_slot(trans, root, &key, path, - new_size + extra_size, 1); - if (ret < 0) - return ret; - BUG_ON(ret); - - ret = btrfs_extend_item(trans, root, path, new_size); - BUG_ON(ret); - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, item, refs); - /* FIXME: get real generation */ - btrfs_set_extent_generation(leaf, item, 0); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - btrfs_set_extent_flags(leaf, item, - BTRFS_EXTENT_FLAG_TREE_BLOCK | - BTRFS_BLOCK_FLAG_FULL_BACKREF); - bi = (struct btrfs_tree_block_info *)(item + 1); - /* FIXME: get first key of the block */ - memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); - btrfs_set_tree_block_level(leaf, bi, (int)owner); - } else { - btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} -#endif - -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) -{ - u32 high_crc = ~(u32)0; - u32 low_crc = ~(u32)0; - __le64 lenum; - - lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - - return ((u64)high_crc << 31) ^ (u64)low_crc; -} - -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) -{ - return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), - btrfs_extent_data_ref_objectid(leaf, ref), - btrfs_extent_data_ref_offset(leaf, ref)); -} - -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) -{ - if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != owner || - btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; -} - -static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, 
u64 parent, - u64 root_objectid, - u64 owner, u64 offset) +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) { struct btrfs_key key; - struct btrfs_extent_data_ref *ref; + struct btrfs_extent_ref *ref; struct extent_buffer *leaf; - u32 nritems; + u64 ref_objectid; int ret; - int recow; - int err = -ENOENT; key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - } -again: - recow = 0; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; - if (parent) { - if (!ret) - return 0; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(root, path); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (!ret) - return 0; -#endif - goto fail; + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - recow = 1; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != bytenr || - key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; - - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) { - if (recow) { - btrfs_release_path(root, path); - goto again; - } - err = 0; - break; - } - path->slots[0]++; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; } -fail: - return err; + ret = 0; +out: + return ret; } -static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, + int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; - u32 size; + struct btrfs_extent_ref *ref; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - size = sizeof(struct btrfs_shared_data_ref); - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - size = sizeof(struct btrfs_extent_data_ref); - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, size); - if (ret && ret != -EEXIST) - goto fail; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; - leaf = path->nodes[0]; - if (parent) { - struct 
btrfs_shared_data_ref *ref; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, refs_to_add); + } else if (ret == -EEXIST) { + u64 existing_owner; + + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; } - } else { - struct btrfs_extent_data_ref *ref; - while (ret == -EEXIST) { - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) - break; - btrfs_release_path(root, path); - key.offset++; - ret = btrfs_insert_empty_item(trans, root, path, &key, - size); - if (ret && ret != -EEXIST) - goto fail; - leaf = path->nodes[0]; - } - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); } + ret = 0; + } else { + goto out; } - btrfs_mark_buffer_dirty(leaf); - ret = 0; -fail: + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: btrfs_release_path(root, path); return ret; } -static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) { - struct btrfs_key key; - struct btrfs_extent_data_ref *ref1 = NULL; - struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; - u32 num_refs = 0; + struct btrfs_extent_ref *ref; + u32 num_refs; int ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct 
btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - BUG(); - } - + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; - if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); - else if (key.type == BTRFS_SHARED_DATA_REF_KEY) - btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - else { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - btrfs_set_ref_count_v0(leaf, ref0, num_refs); - } -#endif + btrfs_set_ref_num_refs(leaf, ref, num_refs); btrfs_mark_buffer_dirty(leaf); } + btrfs_release_path(root, path); return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) { - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; - u32 num_refs = 0; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (iref) { - if (btrfs_extent_inline_ref_type(leaf, iref) == - BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - WARN_ON(1); - } - return num_refs; + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); } +#endif -static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) { - struct btrfs_key key; +#ifdef BIO_RW_DISCARD int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ret == -ENOENT && parent) { - btrfs_release_path(root, path); - 
key.type = BTRFS_EXTENT_REF_V0_KEY; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); } -#endif + return ret; +#else + return 0; +#endif } -static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) { - struct btrfs_key key; int ret; + int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(root, path); + ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, + orig_parent, parent, orig_root, + ref_root, orig_generation, + ref_generation, owner_objectid, pin); + BUG_ON(ret); return ret; } -static inline int extent_ref_type(u64 parent, u64 owner) +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) { - int type; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (parent > 0) - type = BTRFS_SHARED_BLOCK_REF_KEY; - else - type = BTRFS_TREE_BLOCK_REF_KEY; - } else { - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - } - return type; + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + + ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, parent, ref_root, + ref_root, ref_generation, + ref_generation, owner_objectid); + return ret; } +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; -static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + return ret; +} +static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, + int refs_to_add) { - int level; - BUG_ON(!path->keep_locks); - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - if (!path->nodes[level]) - break; - btrfs_assert_tree_locked(path->nodes[level]); - if (path->slots[level] + 1 >= - btrfs_header_nritems(path->nodes[level])) - continue; - if (level == 0) - btrfs_item_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - else - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - return 1; -} - -/* - * look for inline back ref. 
if back ref is found, *ref_ret is set - * to the address of inline back ref, and 0 is returned. - * - * if back ref isn't found, *ref_ret is set to the address where it - * should be inserted, and -ENOENT is returned. - * - * if insert is true and there are too many inline back refs, the path - * points to the extent item, and -EAGAIN is returned. - * - * NOTE: inline back refs are ordered in the same way that back ref - * items in the tree are ordered. - */ -static noinline_for_stack -int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int insert) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - u64 flags; - u64 item_size; - unsigned long ptr; - unsigned long end; - int extra_size; - int type; - int want; + struct btrfs_path *path; int ret; - int err = 0; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; - want = extent_ref_type(parent, owner); - if (insert) { - extra_size = btrfs_extent_inline_ref_size(want); - path->keep_locks = 1; - } else - extra_size = -1; - ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + /* first find the extent item and update its reference count */ + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(ret); - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - if (!insert) { - err = -ENOENT; - goto out; - } - ret = convert_extent_item_v0(trans, root, path, owner, - extra_size); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - err = -ENOENT; - while (1) { - if (ptr >= end) { - WARN_ON(ptr > end); - break; - } - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - if (want < type) - break; - if (want > type) { - ptr += btrfs_extent_inline_ref_size(type); - continue; - } - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (match_extent_data_ref(leaf, dref, root_objectid, - owner, offset)) { - err = 0; - break; - } - if (hash_extent_data_ref_item(leaf, dref) < - hash_extent_data_ref(root_objectid, owner, offset)) - break; - } else { - u64 ref_offset; - ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (parent > 0) { - if (parent == ref_offset) { - err = 0; - break; - } - if (ref_offset < parent) - break; - } else { - if (root_objectid == ref_offset) { - err = 0; - break; - } - if (ref_offset < root_objectid) - break; - } - } - ptr 
+= btrfs_extent_inline_ref_size(type); - } - if (err == -ENOENT && insert) { - if (item_size + extra_size >= - BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; - goto out; - } - /* - * To add new inline back ref, we have to make sure - * there is no corresponding back ref item. - * For simplicity, we just do not add new inline back - * ref if there is any kind of item for this block - */ - if (find_next_key(path, &key) == 0 && key.objectid == bytenr && - key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; - goto out; - } - } - *ref_ret = (struct btrfs_extent_inline_ref *)ptr; -out: - if (insert) { - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); - } - return err; -} - -/* - * helper to add new inline back ref - */ -static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - unsigned long ptr; - unsigned long end; - unsigned long item_offset; - u64 refs; - int size; - int type; - int ret; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - item_offset = (unsigned long)iref - (unsigned long)ei; - - type = extent_ref_type(parent, owner); - size = btrfs_extent_inline_ref_size(type); - - ret = btrfs_extend_item(trans, root, path, size); - BUG_ON(ret); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - refs += refs_to_add; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); - if (ptr < end - size) - memmove_extent_buffer(leaf, ptr + size, ptr, - end - size - ptr); - - iref = (struct btrfs_extent_inline_ref *)ptr; - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, dref, owner); - btrfs_set_extent_data_ref_offset(leaf, dref, offset); - btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_shared_data_ref *sref; - sref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, ref_ret, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 0); - if (ret != -ENOENT) + btrfs_set_path_blocking(path); return ret; - - btrfs_release_path(root, path); - *ref_ret = NULL; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = 
lookup_tree_block_ref(trans, root, path, bytenr, parent, - root_objectid); - } else { - ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, - root_objectid, owner, offset); - } - return ret; -} - -/* - * helper to update/remove inline back ref - */ -static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_data_ref *dref = NULL; - struct btrfs_shared_data_ref *sref = NULL; - unsigned long ptr; - unsigned long end; - u32 item_size; - int size; - int type; - int ret; - u64 refs; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); - refs += refs_to_mod; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - type = btrfs_extent_inline_ref_type(leaf, iref); - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - refs = btrfs_extent_data_ref_count(leaf, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - sref = (struct btrfs_shared_data_ref *)(iref + 1); - refs = btrfs_shared_data_ref_count(leaf, sref); - } else { - refs = 1; - BUG_ON(refs_to_mod != -1); } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); - refs += refs_to_mod; - - if (refs > 0) { - if (type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, dref, refs); - else - btrfs_set_shared_data_ref_count(leaf, sref, refs); - } else { - size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - if (ptr + size < end) - memmove_extent_buffer(leaf, ptr, ptr + size, - end - ptr - size); - item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); - BUG_ON(ret); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static noinline_for_stack -int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_extent_inline_ref *iref; - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 1); - if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); - } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); - } - return ret; -} - -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, root, path, bytenr, - parent, root_objectid); - } else { - ret = insert_extent_data_ref(trans, root, path, bytenr, - parent, root_objectid, - owner, offset, refs_to_add); - } - return ret; -} - -static int 
remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) -{ - int ret; - - BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); - } else { - ret = btrfs_del_item(trans, root, path); + if (ret > 0) { + WARN_ON(1); + btrfs_free_path(path); + return -EIO; } - return ret; -} - -#ifdef BIO_RW_DISCARD -static void btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) -{ - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); -} -#endif - -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ -#ifdef BIO_RW_DISCARD - int ret; - u64 map_length = num_bytes; - struct btrfs_multi_bio *multi = NULL; - - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); - if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; - int i; + l = path->nodes[0]; - if (map_length > num_bytes) - map_length = num_bytes; - - for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); - } - kfree(multi); + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); - return ret; -#else - return 0; -#endif -} + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + refs_to_add); + btrfs_unlock_up_safe(path, 1); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); - } - return ret; -} - -static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_extent_item *item; - u64 refs; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 1; - path->leave_spinning = 1; - /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, - root_objectid, owner, offset, - refs_to_add, extent_op); - if (ret == 0) - goto out; - - if (ret != -EAGAIN) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, item); - 
btrfs_set_extent_refs(leaf, item, refs + refs_to_add); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, item); + btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; @@ -1514,197 +802,56 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid, refs_to_add); BUG_ON(ret); -out: btrfs_free_path(path); - return err; -} - -static int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) -{ - int ret = 0; - struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 ref_root = 0; - u64 flags = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_data_ref(node); - if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); - flags |= extent_op->flags_to_set; - } - ret = alloc_reserved_file_extent(trans, root, - parent, ref_root, flags, - ref->objectid, ref->offset, - &ins, node->ref_mod); - update_reserved_extents(root, ins.objectid, ins.offset, 0); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else { - BUG(); - } - return ret; -} - -static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, - struct extent_buffer *leaf, - struct btrfs_extent_item *ei) -{ - u64 flags = btrfs_extent_flags(leaf, ei); - if (extent_op->update_flags) { - flags |= extent_op->flags_to_set; - btrfs_set_extent_flags(leaf, ei, flags); - } - - if (extent_op->update_key) { - struct btrfs_tree_block_info *bi; - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_set_tree_block_key(leaf, bi, &extent_op->key); - } + return 0; } -static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - u32 item_size; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; - - path->reada = 1; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - err = -EIO; - goto out; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - ret = convert_extent_item_v0(trans, 
root->fs_info->extent_root, - path, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - __run_delayed_extent_op(extent_op, leaf, ei); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return err; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; } -static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) +static int drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node) { int ret = 0; - struct btrfs_delayed_tree_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 ref_root = 0; + struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + BUG_ON(node->ref_mod == 0); + ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, ref->pin, node->ref_mod); - ref = btrfs_delayed_node_to_tree_ref(node); - if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - BUG_ON(node->ref_mod != 1); - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); - ret = alloc_reserved_tree_block(trans, root, - parent, ref_root, - extent_op->flags_to_set, - &extent_op->key, - ref->level, &ins); - update_reserved_extents(root, ins.objectid, ins.offset, 0); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else { - BUG(); - } return ret; } - /* helper function to actually process a single delayed ref entry */ -static int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) +static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + int insert_reserved) { int ret; - if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref *ref; + + if (node->parent == (u64)-1) { struct btrfs_delayed_ref_head *head; /* * we've hit the end of the chain and we were supposed @@ -1712,35 +859,44 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, * deleted before we ever needed to insert it, so all * we have to do is clean up the accounting */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - if (head->is_data) { - ret = 
btrfs_del_csums(trans, root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } - btrfs_update_pinned_extents(root, node->bytenr, - node->num_bytes, 1); update_reserved_extents(root, node->bytenr, node->num_bytes, 0); } + head = btrfs_delayed_node_to_head(node); mutex_unlock(&head->mutex); return 0; } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, root, node, extent_op, - insert_reserved); - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, root, node, extent_op, - insert_reserved); - else - BUG(); - return ret; + ref = btrfs_delayed_node_to_ref(node); + if (ref->action == BTRFS_ADD_DELAYED_REF) { + if (insert_reserved) { + struct btrfs_key ins; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + /* record the full extent allocation */ + ret = __btrfs_alloc_reserved_extent(trans, root, + node->parent, ref->root, + ref->generation, ref->owner_objectid, + &ins, node->ref_mod); + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } else { + /* just add one backref */ + ret = add_extent_ref(trans, root, node->bytenr, + node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, node->ref_mod); + } + BUG_ON(ret); + } else if (ref->action == BTRFS_DROP_DELAYED_REF) { + WARN_ON(insert_reserved); + ret = drop_delayed_ref(trans, root, node); + } + return 0; } static noinline struct btrfs_delayed_ref_node * @@ -1763,7 +919,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) rb_node); if (ref->bytenr != head->node.bytenr) break; - if (ref->action == action) + if (btrfs_delayed_node_to_ref(ref)->action == action) return ref; node = rb_prev(node); } @@ -1781,7 +937,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; - struct btrfs_delayed_extent_op *extent_op; int ret; int count = 0; int must_insert_reserved = 0; @@ -1820,9 +975,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates @@ -1834,25 +986,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * so that any accounting fixes can happen */ ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - kfree(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&delayed_refs->lock); - - ret = run_delayed_extent_op(trans, root, - ref, extent_op); - BUG_ON(ret); - kfree(extent_op); - - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; - } - list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -1860,17 +993,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root, ref, extent_op, + ret = run_one_delayed_ref(trans, root, ref, must_insert_reserved); BUG_ON(ret); - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; + count++; cond_resched(); spin_lock(&delayed_refs->lock); } @@ -1965,112 
+1095,25 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 flags, - int is_data) -{ - struct btrfs_delayed_extent_op *extent_op; - int ret; - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - if (!extent_op) - return -ENOMEM; - - extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; - - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); - if (ret) - kfree(extent_op); - return ret; -} - -static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - int ret = 0; - - ret = -ENOENT; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; - - data_ref = btrfs_delayed_node_to_data_ref(ref); - - node = rb_prev(node); - if (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) - goto out_unlock; - } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: - mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - -static noinline int check_committed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref; - struct btrfs_extent_inline_ref *iref; - struct btrfs_extent_item *ei; + struct btrfs_extent_ref *ref_item; struct btrfs_key key; - u32 item_size; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + path = btrfs_alloc_path(); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2082,83 +1125,55 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, path->slots[0]--; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; - - ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct 
btrfs_extent_item_v0)); - goto out; - } -#endif - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) - goto out; - - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out; - - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - if (btrfs_extent_inline_ref_type(leaf, iref) != - BTRFS_EXTENT_DATA_REF_KEY) - goto out; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (btrfs_extent_refs(leaf, ei) != - btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || - btrfs_extent_data_ref_offset(leaf, ref) != offset) + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) goto out; - ret = 0; -out: - return ret; -} - -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_path *path; - int ret; - int ret2; + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; - path = btrfs_alloc_path(); - if (!path) - return -ENOENT; + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } - do { - ret = check_committed_ref(trans, root, path, objectid, - offset, bytenr); - if (ret && ret != -ENOENT) + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } - ret2 = check_delayed_ref(trans, root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); - - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; + path->slots[0]++; } - - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; + ret = 0; out: btrfs_free_path(path); return ret; } -#if 0 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u32 nr_extents) { @@ -2276,49 +1291,191 @@ static int refsort_cmp(const void *a_void, const void *b_void) return 1; return 0; } -#endif -static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + +noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, - int full_backref, int inc) + struct extent_buffer *orig_buf, + struct extent_buffer *buf, u32 *nr_extents) { u64 bytenr; - u64 num_bytes; - u64 parent; u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct refsort *sorted; u32 nritems; + u32 nr_file_extents = 0; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int level; int ret = 0; + int faili = 0; + int refi = 0; + int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); + ref_generation = 
btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) - return 0; + sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); + BUG_ON(!sorted); - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + /* + * we make two passes through the items. In the first pass we + * only record the byte number and slot. Then we sort based on + * byte number and do the actual work based on the sorted results + */ + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } else { + bytenr = btrfs_node_blockptr(buf, i); + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } + } + /* + * if refi == 0, we didn't actually put anything into the sorted + * array and we're done + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + cond_resched(); + slot = sorted[i].slot; + bytenr = sorted[i].bytenr; + + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + ret = process_func(trans, root, bytenr, + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = slot; + WARN_ON(1); + goto fail; + } + } else { + ret = process_func(trans, root, bytenr, buf->len, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = slot; + WARN_ON(1); + goto fail; + } + } + } +out: + kfree(sorted); + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + kfree(sorted); + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); - if (full_backref) - parent = buf->start; - else - parent = 0; + if (!root->ref_cows) { + if (level == 0 && + 
root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } - for (i = 0; i < nritems; i++) { + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, i); + btrfs_item_key_to_cpu(buf, &key, slot); if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; - fi = btrfs_item_ptr(buf, i, + fi = btrfs_item_ptr(buf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) @@ -2326,39 +1483,28 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (bytenr == 0) continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); - key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, + ref_generation, key.objectid); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + buf->len, orig_buf->start, + buf->start, orig_root, ref_root, + orig_generation, ref_generation, + level - 1); if (ret) goto fail; } } return 0; fail: - BUG(); - return ret; -} - -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); -} - -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + WARN_ON(1); + return -1; } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2861,24 +2007,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); - spin_unlock(&info->delalloc_lock); - while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) @@ -3088,6 +2216,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_owner = btrfs_header_owner(buf); u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *must_clean = buf; @@ -3105,77 +2235,63 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, return 0; } - -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int 
refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free, + int refs_to_drop) { - struct btrfs_key key; struct btrfs_path *path; + struct btrfs_key key; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; int ret; - int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - u32 item_size; - u64 refs; + struct btrfs_extent_item *ei; + u32 refs; + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 1; path->leave_spinning = 1; - - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); - - ret = lookup_extent_backref(trans, extent_root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner_objectid, - owner_offset); + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); if (ret == 0) { + struct btrfs_key found_key; extent_slot = path->slots[0]; - while (extent_slot >= 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, extent_slot); - if (key.objectid != bytenr) + if (found_key.objectid != bytenr) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) { + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { found_extent = 1; break; } if (path->slots[0] - extent_slot > 5) break; - extent_slot--; } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); - if (found_extent && item_size < sizeof(*ei)) - found_extent = 0; -#endif if (!found_extent) { - BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, - is_data); + refs_to_drop); BUG_ON(ret); btrfs_release_path(extent_root, path); path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -3191,98 +2307,82 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", + "parent %llu root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, (unsigned long long)parent, (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - BUG_ON(found_extent || extent_slot != path->slots[0]); - ret = convert_extent_item_v0(trans, extent_root, path, - owner_objectid, 0); - BUG_ON(ret < 0); - - btrfs_release_path(extent_root, path); - path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - ret = btrfs_search_slot(trans, 
extent_root, &key, path, - -1, 1); - if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); - } - BUG_ON(ret); - extent_slot = path->slots[0]; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); - } -#endif - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); - } - refs = btrfs_extent_refs(leaf, ei); + + /* + * we're not allowed to delete the extent item if there + * are other delayed ref updates pending + */ + BUG_ON(refs < refs_to_drop); refs -= refs_to_drop; + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); - if (refs > 0) { - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - /* - * In the case of inline back ref, reference count will - * be updated by remove_extent_backref + if (refs == 0 && found_extent && + path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot */ - if (iref) { - BUG_ON(!found_extent); - } else { - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - } - if (found_extent) { - ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, - is_data); + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); BUG_ON(ret); } - } else { - int mark_free = 0; + } + + if (refs == 0) { + u64 super_used; + u64 root_used; struct extent_buffer *must_clean = NULL; - if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); - if (iref) { - BUG_ON(path->slots[0] != extent_slot); - } else { - BUG_ON(path->slots[0] != extent_slot + 1); - path->slots[0] = extent_slot; - num_to_del = 2; - } + if (pin) { + ret = pin_down_bytes(trans, root, path, + bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, + &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + /* * it is going to be very rare for someone to be waiting * on the block we're freeing. 
del_items might need to @@ -3303,7 +2403,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, free_extent_buffer(must_clean); } - if (is_data) { + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); } else { @@ -3320,6 +2420,34 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int refs_to_drop) +{ + WARN_ON(num_bytes < root->sectorsize); + + /* + * if metadata always pin + * if data pin when any transaction has committed this + */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || + ref_generation != trans->transid) + pin = 1; + + if (ref_generation != trans->transid) + pin = 1; + + return __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0, refs_to_drop); +} + /* * when we free an extent, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for @@ -3351,13 +2479,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (ref->bytenr == bytenr) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - kfree(head->extent_op); - head->extent_op = NULL; - } - /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway @@ -3386,8 +2507,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); + &head->node, head->must_insert_reserved); BUG_ON(ret); btrfs_put_delayed_ref(&head->node); return 0; @@ -3399,32 +2519,32 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) { int ret; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. + * + * data extents referenced by the tree log do need to have + * their reference counts bumped. 
*/ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, + BTRFS_DROP_DELAYED_REF, 1); BUG_ON(ret); ret = check_ref_cleanup(trans, root, bytenr); BUG_ON(ret); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); - BUG_ON(ret); } return ret; } @@ -3599,7 +2719,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, last_ptr_loop = 0; /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, + ret = btrfs_find_space_cluster(trans, block_group, last_ptr, offset, num_bytes, empty_cluster + empty_size); @@ -3849,147 +2969,99 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, return ret; } -static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins, + int ref_mod) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; struct btrfs_extent_item *extent_item; - struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_ref *ref; struct btrfs_path *path; - struct extent_buffer *leaf; - int type; - u32 size; - - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); - BUG_ON(ret); + struct btrfs_key keys[2]; - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, ref_mod); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_DATA); - - iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (parent > 0) { - struct btrfs_shared_data_ref *ref; - ref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); - } else { - struct btrfs_extent_data_ref *ref; - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); - } + 
if (parent == 0) + parent = ins->objectid; - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); - BUG(); - } - return ret; -} + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); -static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_extent_item *extent_item; - struct btrfs_tree_block_info *block_info; - struct btrfs_extent_inline_ref *iref; - struct btrfs_path *path; - struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); path = btrfs_alloc_path(); BUG_ON(!path); path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); BUG_ON(ret); - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, 1); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); - - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); - if (parent > 0) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } + btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); - btrfs_mark_buffer_dirty(leaf); + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + if (ret) + goto out; + + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, 
(unsigned long long)ins->offset); BUG(); } +out: return ret; } -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) { int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner, + BTRFS_ADD_DELAYED_EXTENT, 0); + BUG_ON(ret); return ret; } @@ -3998,10 +3070,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, * an extent has been allocated and makes sure to clear the free * space cache bits as well */ -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins) +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) { int ret; struct btrfs_block_group_cache *block_group; @@ -4015,8 +3087,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ins->offset); BUG_ON(ret); btrfs_put_block_group(block_group); - ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, - 0, owner, offset, ins, 1); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins, 1); return ret; } @@ -4027,48 +3099,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, * * returns 0 if everything worked, non-zero otherwise. 
*/ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) { int ret; - u64 flags = 0; - - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); BUG_ON(ret); - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_EXTENT, 0); BUG_ON(ret); } + update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -4107,17 +3157,21 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) { struct btrfs_key ins; int ret; struct extent_buffer *buf; - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -4131,19 +3185,32 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { - u64 disk_bytenr; - u64 num_bytes; + u64 leaf_owner; + u64 leaf_generation; + struct refsort *sorted; struct btrfs_key key; struct btrfs_file_extent_item *fi; - u32 nritems; int i; + int nritems; int ret; + int refi = 0; + int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + /* we do this loop twice. The first time we build a list + * of the extents we have a reference on, then we sort the list + * by bytenr. 
The second time around we actually do the + * extent freeing. + */ for (i = 0; i < nritems; i++) { + u64 disk_bytenr; cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); /* only extents have references, skip everything else */ @@ -4163,16 +3230,45 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, if (disk_bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); + sorted[refi].bytenr = disk_bytenr; + sorted[refi].slot = i; + refi++; + } + + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + u64 disk_bytenr; + + disk_bytenr = sorted[i].bytenr; + slot = sorted[i].slot; + + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + ret = btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); } +out: + kfree(sorted); return 0; } -#if 0 - static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -4215,14 +3311,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } - static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); + ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -4257,7 +3352,6 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, return ret; } - /* * this is used while deleting old snapshots, and it drops the refs * on a whole subtree starting from a level 1 node. @@ -4551,36 +3645,32 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, cond_resched(); return 0; } -#endif /* * helper function for drop_subtree, this function is similar to * walk_down_tree. The main difference is that it checks reference * counts while tree blocks are locked. 
*/ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) { struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; u64 bytenr; u64 ptr_gen; - u64 refs; - u64 flags; u32 blocksize; + u32 refs; int ret; cur = path->nodes[*level]; - ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, - &refs, &flags); + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); BUG_ON(ret); if (refs > 1) goto out; - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - while (*level >= 0) { cur = path->nodes[*level]; if (*level == 0) { @@ -4602,15 +3692,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); BUG_ON(ret); if (refs > 1) { parent = path->nodes[*level]; ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - *level - 1, 0); + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); BUG_ON(ret); path->slots[*level]++; btrfs_tree_unlock(next); @@ -4618,8 +3709,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, continue; } - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - *level = btrfs_header_level(next); path->nodes[*level] = next; path->slots[*level] = 0; @@ -4627,15 +3716,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, cond_resched(); } out: - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; + parent = path->nodes[*level + 1]; bytenr = path->nodes[*level]->start; blocksize = path->nodes[*level]->len; - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, - btrfs_header_owner(parent), *level, 0); + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); BUG_ON(ret); if (path->locks[*level]) { @@ -4659,6 +3746,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, int max_level) { + u64 root_owner; + u64 root_gen; struct btrfs_root_item *root_item = &root->root_item; int i; int slot; @@ -4666,22 +3755,24 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, for (i = *level; i < max_level && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + /* * there is more work to do in this level. 
* Update the drop_progress marker to reflect * the work we've done so far, and then bump * the slot number */ + node = path->nodes[i]; path->slots[i]++; - WARN_ON(*level == 0); - if (max_level == BTRFS_MAX_LEVEL) { - btrfs_node_key(path->nodes[i], - &root_item->drop_progress, - path->slots[i]); - root_item->drop_level = i; - } *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; return 0; } else { struct extent_buffer *parent; @@ -4695,20 +3786,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, else parent = path->nodes[*level + 1]; - clean_tree_block(trans, root, path->nodes[i]); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); ret = btrfs_free_extent(trans, root, - path->nodes[i]->start, - path->nodes[i]->len, - parent->start, - btrfs_header_owner(parent), - *level, 0); + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); BUG_ON(ret); if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[i]); - path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; *level = i + 1; } } @@ -4727,18 +3820,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root int wret; int level; struct btrfs_path *path; + int i; + int orig_level; int update_count; struct btrfs_root_item *root_item = &root->root_item; + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); path = btrfs_alloc_path(); BUG_ON(!path); level = btrfs_header_level(root->node); + orig_level = level; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(path->nodes[level]); + path->nodes[level] = root->node; + extent_buffer_get(root->node); path->slots[level] = 0; - path->locks[level] = 1; } else { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -4760,7 +3856,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ - btrfs_unlock_up_safe(path, 0); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } } while (1) { unsigned long update; @@ -4781,6 +3882,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EAGAIN; break; } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); for (update_count = 0; update_count < 16; update_count++) { update = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -4790,6 +3893,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; } } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } out: btrfs_free_path(path); return ret; @@ -4822,7 +3931,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_tree(trans, root, path, &level); + wret = walk_down_subtree(trans, root, path, &level); if (wret < 0) ret = wret; if (wret != 0) @@ -4839,7 +3948,6 @@ int 
btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -#if 0 static unsigned long calc_ra(unsigned long start, unsigned long last, unsigned long nr) { @@ -6321,7 +5429,6 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, kfree(ref_path); return ret; } -#endif static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { @@ -6370,8 +5477,7 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, u64 calc; spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { spin_unlock(&shrink_block_group->lock); trans = btrfs_start_transaction(root, 1); @@ -6396,17 +5502,6 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, return 0; } - -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) - -{ - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); - return 0; -} - -#if 0 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 size) @@ -6686,7 +5781,6 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) btrfs_free_path(path); return ret; } -#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 68260180f587..fe9eb990e443 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -476,7 +476,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; - u64 last_end; int err; int set = 0; @@ -499,7 +498,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (state->start > end) goto out; WARN_ON(state->end < start); - last_end = state->end; /* * | ---- desired range ---- | @@ -526,11 +524,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (err) goto out; if (state->end <= end) { + start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; } else { start = state->start; } @@ -556,10 +552,8 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; } + start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; goto search_again; out: @@ -713,10 +707,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; } set_state_bits(tree, state, bits); + start = state->end + 1; merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; goto search_again; } @@ -750,10 +742,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, goto out; if (state->end <= end) { set_state_bits(tree, state, bits); + start = state->end + 1; merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; } else { start = state->start; } diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 126477eaecf5..1d51dc38bb49 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -291,12 +291,16 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, { u64 extent_end = 0; u64 search_start = start; + u64 leaf_start; u64 ram_bytes = 0; + u64 orig_parent = 0; 
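	/*
	 * Note on the bookkeeping added above (a reading of this hunk, not
	 * part of the original patch text): with the parent/generation
	 * based backrefs this patch restores, btrfs_drop_extents() has to
	 * remember where an extent pointer used to live. leaf_start,
	 * root_owner and root_gen are filled from the leaf that held the
	 * file extent item and are later handed to btrfs_free_extent();
	 * orig_parent records path->nodes[0]->start before the leaf can be
	 * COWed or split, so btrfs_update_extent_ref() can move the backref
	 * from the original leaf to the new one (leaf->start) afterwards.
	 */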
u64 disk_bytenr = 0; u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; struct btrfs_path *path; @@ -336,6 +340,9 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, bookend = 0; found_extent = 0; found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; compression = 0; encryption = 0; extent = NULL; @@ -410,6 +417,9 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; } if (end < extent_end && end >= key.offset) { @@ -433,14 +443,14 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, } locked_end = extent_end; } + orig_parent = path->nodes[0]->start; disk_bytenr = le64_to_cpu(old.disk_bytenr); if (disk_bytenr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, - le64_to_cpu(old.disk_num_bytes), 0, - root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); BUG_ON(ret); } } @@ -558,6 +568,17 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_set_lock_blocking(path->nodes[0]); + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) @@ -573,9 +594,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, old_disk_bytenr, le64_to_cpu(old.disk_num_bytes), - 0, root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + leaf_start, root_owner, + root_gen, key.objectid, 0); BUG_ON(ret); *hint_byte = old_disk_bytenr; } @@ -644,11 +664,12 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, u64 bytenr; u64 num_bytes; u64 extent_end; - u64 orig_offset; + u64 extent_offset; u64 other_start; u64 other_end; u64 split = start; u64 locked_end = end; + u64 orig_parent; int extent_type; int split_end = 1; int ret; @@ -682,7 +703,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); if (key.offset == start) split = end; @@ -690,6 +711,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (key.offset == start && extent_end == end) { int del_nr = 0; int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, @@ -698,8 +721,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); BUG_ON(ret); } other_start = 0; @@ -710,8 +733,8 @@ int 
btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); BUG_ON(ret); } split_end = 0; @@ -745,12 +768,13 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, locked_end = extent_end; } btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; } else { BUG_ON(key.offset != start); - key.offset = split; - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); extent_end = split; } @@ -769,8 +793,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - split); goto done; @@ -792,9 +815,10 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - inode->i_ino, orig_offset); + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); BUG_ON(ret); btrfs_release_path(root, path); @@ -809,12 +833,20 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_set_file_extent_type(leaf, fi, extent_type); btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_compression(leaf, fi, 0); btrfs_set_file_extent_encryption(leaf, fi, 0); btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } done: btrfs_mark_buffer_dirty(leaf); @@ -1157,8 +1189,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ diff --git a/trunk/fs/btrfs/free-space-cache.c b/trunk/fs/btrfs/free-space-cache.c index 4538e48581a5..0bc93657b460 100644 --- a/trunk/fs/btrfs/free-space-cache.c +++ b/trunk/fs/btrfs/free-space-cache.c @@ -579,7 +579,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -596,9 +595,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, int ret; /* for metadata, allow allocates 
with more holes */ - if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; - } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { /* * we want to do larger allocations when we are * flushing out the delayed refs, it helps prevent @@ -648,15 +645,14 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - (last->offset + last->bytes) > 128 * 1024 || - next->offset - window_start > (bytes + empty_size) * 2) { + if (next->offset - window_start > (bytes + empty_size) * 2) { entry = next; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = 0; total_retries++; - if (total_retries % 64 == 0) { + if (total_retries % 256 == 0) { if (min_bytes >= (bytes + empty_size)) { ret = -ENOSPC; goto out; diff --git a/trunk/fs/btrfs/free-space-cache.h b/trunk/fs/btrfs/free-space-cache.h index 266fb8764054..ab0bdc0a63ce 100644 --- a/trunk/fs/btrfs/free-space-cache.h +++ b/trunk/fs/btrfs/free-space-cache.h @@ -31,7 +31,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); diff --git a/trunk/fs/btrfs/hash.h b/trunk/fs/btrfs/hash.h index db2ff9773b99..2a020b276768 100644 --- a/trunk/fs/btrfs/hash.h +++ b/trunk/fs/btrfs/hash.h @@ -19,9 +19,9 @@ #ifndef __HASH__ #define __HASH__ -#include +#include "crc32c.h" static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } #endif diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 8612b3a09811..1c8b0190d031 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -48,6 +48,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "ref-cache.h" #include "compression.h" #include "locking.h" @@ -368,7 +369,7 @@ static noinline int compress_file_range(struct inode *inode, * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. 
*/ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && + if (!btrfs_test_flag(inode, NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -469,7 +470,7 @@ static noinline int compress_file_range(struct inode *inode, nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + btrfs_set_flag(inode, NOCOMPRESS); } if (will_compress) { *num_added += 1; @@ -862,7 +863,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (btrfs_test_flag(inode, NOCOMPRESS)) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -943,7 +944,6 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 cow_start; u64 cur_offset; u64 extent_end; - u64 extent_offset; u64 disk_bytenr; u64 num_bytes; int extent_type; @@ -1005,7 +1005,6 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= start) { @@ -1023,10 +1022,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; if (btrfs_cross_ref_exist(trans, root, inode->i_ino, - found_key.offset - - extent_offset, disk_bytenr)) + disk_bytenr)) goto out_check; - disk_bytenr += extent_offset; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* @@ -1133,10 +1131,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS)) @@ -1290,7 +1288,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + skip_sum = btrfs_test_flag(inode, NODATASUM); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1491,9 +1489,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root, - root->root_key.objectid, - inode->i_ino, file_pos, &ins); + ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); BUG_ON(ret); btrfs_free_path(path); @@ -1790,8 +1788,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (btrfs_test_flag(inode, NODATASUM)) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -1959,13 +1956,23 @@ void 
btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ - found_key.objectid = found_key.offset; - found_key.type = BTRFS_INODE_ITEM_KEY; - found_key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &found_key, root); - if (IS_ERR(inode)) + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) break; + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it @@ -2062,7 +2069,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2157,8 +2164,6 @@ static void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } - - btrfs_update_iflags(inode); return; make_bad: @@ -2322,6 +2327,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_update_inode(trans, root, dir); btrfs_drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; out: return ret; } @@ -2593,8 +2599,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_bytes = 0; - u64 extent_offset = 0; u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; int found_extent; int del_item; int pending_del_nr = 0; @@ -2709,9 +2716,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { @@ -2719,6 +2723,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) inode_sub_bytes(inode, num_dec); } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* @@ -2762,12 +2768,12 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, } else { break; } - if (found_extent && root->ref_cows) { + if (found_extent) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - inode->i_ino, extent_offset); + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); BUG_ON(ret); } next: @@ -2805,6 +2811,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, pending_del_nr); } btrfs_free_path(path); + inode->i_sb->s_dirt = 1; return ret; } @@ -3098,45 +3105,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } -static void inode_tree_add(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; - - spin_lock(&root->inode_lock); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct 
btrfs_inode, rb_node); - - if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; - else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; - } - } - rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); - rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); -} - -static void inode_tree_del(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - } -} - static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); @@ -3162,7 +3130,6 @@ static noinline void init_btrfs_i(struct inode *inode) inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3185,9 +3152,26 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, - struct btrfs_root *root) +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -3204,21 +3188,24 @@ static struct inode *btrfs_iget_locked(struct super_block *s, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root) + struct btrfs_root *root, int *is_new) { struct inode *inode; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EACCES); if (inode->i_state & I_NEW) { BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - - inode_tree_add(inode); unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; } return inode; @@ -3231,7 +3218,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret; + int ret, new; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3249,7 +3236,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - inode = btrfs_iget(dir->i_sb, &location, sub_root); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); if (IS_ERR(inode)) return ERR_CAST(inode); } @@ -3587,9 +3574,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_find_block_group(root, 0, alloc_hint, owner); if ((mode & S_IFREG)) { if 
(btrfs_test_opt(root, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + btrfs_set_flag(inode, NODATASUM); if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + btrfs_set_flag(inode, NODATACOW); } key[0].objectid = objectid; @@ -3643,10 +3630,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); - insert_inode_hash(inode); - inode_tree_add(inode); return inode; fail: if (dir) @@ -3766,6 +3750,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3830,6 +3815,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3876,6 +3862,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) drop_inode = 1; + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); @@ -3957,6 +3944,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); @@ -4695,7 +4683,6 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -4985,6 +4972,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); if (drop_inode) @@ -5073,7 +5061,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + btrfs_set_flag(inode, PREALLOC); if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -5194,7 +5182,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/trunk/fs/btrfs/ioctl.c b/trunk/fs/btrfs/ioctl.c index eff18f5b5362..2624b53ea783 100644 --- a/trunk/fs/btrfs/ioctl.c +++ b/trunk/fs/btrfs/ioctl.c @@ -50,177 +50,7 @@ #include "volumes.h" #include "locking.h" -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & ~FS_DIRSYNC_FL; - else - return flags & (FS_NODUMP_FL | FS_NOATIME_FL); -} - -/* - * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. 
- */ -static unsigned int btrfs_flags_to_ioctl(unsigned int flags) -{ - unsigned int iflags = 0; - - if (flags & BTRFS_INODE_SYNC) - iflags |= FS_SYNC_FL; - if (flags & BTRFS_INODE_IMMUTABLE) - iflags |= FS_IMMUTABLE_FL; - if (flags & BTRFS_INODE_APPEND) - iflags |= FS_APPEND_FL; - if (flags & BTRFS_INODE_NODUMP) - iflags |= FS_NODUMP_FL; - if (flags & BTRFS_INODE_NOATIME) - iflags |= FS_NOATIME_FL; - if (flags & BTRFS_INODE_DIRSYNC) - iflags |= FS_DIRSYNC_FL; - - return iflags; -} - -/* - * Update inode->i_flags based on the btrfs internal flags. - */ -void btrfs_update_iflags(struct inode *inode) -{ - struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - - if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; - if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; - if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; - if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; -} - -/* - * Inherit flags from the parent inode. - * - * Unlike extN we don't have any flags we don't want to inherit currently. - */ -void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) -{ - unsigned int flags; - - if (!dir) - return; - - flags = BTRFS_I(dir)->flags; - - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); - - BTRFS_I(inode)->flags = flags; - btrfs_update_iflags(inode); -} - -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); - unsigned int flags = btrfs_flags_to_ioctl(ip->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct btrfs_inode *ip = BTRFS_I(inode); - struct btrfs_root *root = ip->root; - struct btrfs_trans_handle *trans; - unsigned int flags, oldflags; - int ret; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) - return -EACCES; - - mutex_lock(&inode->i_mutex); - - flags = btrfs_mask_flags(inode->i_mode, flags); - oldflags = btrfs_flags_to_ioctl(ip->flags); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto out_unlock; - } - } - - ret = mnt_want_write(file->f_path.mnt); - if (ret) - goto out_unlock; - - if (flags & FS_SYNC_FL) - ip->flags |= BTRFS_INODE_SYNC; - else - ip->flags &= ~BTRFS_INODE_SYNC; - if (flags & FS_IMMUTABLE_FL) - ip->flags |= BTRFS_INODE_IMMUTABLE; - else - ip->flags &= ~BTRFS_INODE_IMMUTABLE; - if (flags & FS_APPEND_FL) - ip->flags |= BTRFS_INODE_APPEND; - else - ip->flags &= ~BTRFS_INODE_APPEND; - if (flags & FS_NODUMP_FL) - ip->flags |= BTRFS_INODE_NODUMP; - else - ip->flags &= ~BTRFS_INODE_NODUMP; - if (flags & FS_NOATIME_FL) - ip->flags |= BTRFS_INODE_NOATIME; - else - ip->flags &= ~BTRFS_INODE_NOATIME; - if (flags & FS_DIRSYNC_FL) - ip->flags |= BTRFS_INODE_DIRSYNC; - else - ip->flags &= ~BTRFS_INODE_DIRSYNC; - - - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - - 
btrfs_update_iflags(inode); - inode->i_ctime = CURRENT_TIME; - btrfs_end_transaction(trans, root); - - mnt_drop_write(file->f_path.mnt); - out_unlock: - mutex_unlock(&inode->i_mutex); - return 0; -} - -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - - return put_user(inode->i_generation, arg); -} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -252,25 +82,22 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -298,7 +125,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 0; + key.offset = 1; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -1084,10 +911,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, 0, - root->root_key.objectid, - inode->i_ino, - new_key.offset - datao); + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -1247,12 +1074,6 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); - case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: diff --git a/trunk/fs/btrfs/print-tree.c b/trunk/fs/btrfs/print-tree.c index 6d6523da0a30..5f8f218c1005 100644 --- a/trunk/fs/btrfs/print-tree.c +++ b/trunk/fs/btrfs/print-tree.c @@ -45,132 +45,22 @@ static void print_dev_item(struct extent_buffer *eb, (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); } -static void print_extent_data_ref(struct extent_buffer *eb, - struct btrfs_extent_data_ref *ref) -{ - printk(KERN_INFO "\t\textent data backref root %llu " - "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), - btrfs_extent_data_ref_count(eb, ref)); -} - -static void print_extent_item(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - 
struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_disk_key key; - unsigned long end; - unsigned long ptr; - int type; - u32 item_size = btrfs_item_size_nr(eb, slot); - u64 flags; - u64 offset; - - if (item_size < sizeof(*ei)) { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); - printk(KERN_INFO "\t\textent refs %u\n", - btrfs_extent_refs_v0(eb, ei0)); - return; -#else - BUG(); -#endif - } - - ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(eb, ei); - - printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - info = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " - "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), - btrfs_tree_block_level(eb, info)); - iref = (struct btrfs_extent_inline_ref *)(info + 1); - } else { - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - } - - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(eb, iref); - offset = btrfs_extent_inline_ref_offset(eb, iref); - switch (type) { - case BTRFS_TREE_BLOCK_REF_KEY: - printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - print_extent_data_ref(eb, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = (struct btrfs_shared_data_ref *)(iref + 1); - printk(KERN_INFO "\t\tshared data backref " - "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); - break; - default: - BUG(); - } - ptr += btrfs_extent_inline_ref_size(type); - } - WARN_ON(ptr > end); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static void print_extent_ref_v0(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_ref_v0 *ref0; - - ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); - printk("\t\textent back ref root %llu gen %llu " - "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), - (unsigned long)btrfs_ref_count_v0(eb, ref0)); -} -#endif - void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; + struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; - struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_dev_extent *dev_extent; struct btrfs_key key; struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 
(unsigned long long)btrfs_header_bytenr(l), nr, @@ -210,25 +100,20 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); - break; - case BTRFS_TREE_BLOCK_REF_KEY: - printk(KERN_INFO "\t\ttree block backref\n"); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref\n"); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = btrfs_item_ptr(l, i, - struct btrfs_extent_data_ref); - print_extent_data_ref(l, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = btrfs_item_ptr(l, i, - struct btrfs_shared_data_ref); - printk(KERN_INFO "\t\tshared data backref count %u\n", - btrfs_shared_data_ref_count(l, sref)); + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); break; + case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); @@ -254,12 +139,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - print_extent_ref_v0(l, i); -#else - BUG(); -#endif case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/trunk/fs/btrfs/relocation.c b/trunk/fs/btrfs/relocation.c deleted file mode 100644 index b23dc209ae10..000000000000 --- a/trunk/fs/btrfs/relocation.c +++ /dev/null @@ -1,3711 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/rbtree.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "volumes.h" -#include "locking.h" -#include "btrfs_inode.h" -#include "async-thread.h" - -/* - * backref_node, mapping_node and tree_block start with this - */ -struct tree_entry { - struct rb_node rb_node; - u64 bytenr; -}; - -/* - * present a tree block in the backref cache - */ -struct backref_node { - struct rb_node rb_node; - u64 bytenr; - /* objectid tree block owner */ - u64 owner; - /* list of upper level blocks that reference this block */ - struct list_head upper; - /* list of child blocks in the cache */ - struct list_head lower; - /* NULL if this node is not tree root */ - struct btrfs_root *root; - /* extent buffer got by COW the block */ - struct extent_buffer *eb; - /* level of tree block */ - unsigned int level:8; - /* 1 if the block is root of old snapshot */ - unsigned int old_root:1; - /* 1 if no child blocks in the cache */ - unsigned int lowest:1; - /* is the extent buffer locked */ - unsigned int locked:1; - /* has the block been processed */ - unsigned int processed:1; - /* have backrefs of this block been checked */ - unsigned int checked:1; -}; - -/* - * present a block pointer in the backref cache - */ -struct backref_edge { - struct list_head list[2]; - struct backref_node *node[2]; - u64 blockptr; -}; - -#define LOWER 0 -#define UPPER 1 - -struct backref_cache { - /* red black tree of all backref nodes in the cache */ - struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ - struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; -}; - -/* - * map address of tree root to tree - */ -struct mapping_node { - struct rb_node rb_node; - u64 bytenr; - void *data; -}; - -struct mapping_tree { - struct rb_root rb_root; - spinlock_t lock; -}; - -/* - * present a tree block to process - */ -struct tree_block { - struct rb_node rb_node; - u64 bytenr; - struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; -}; - -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - -struct reloc_control { - /* block group to relocate */ - struct btrfs_block_group_cache *block_group; - /* extent tree */ - struct btrfs_root *extent_root; - /* inode for moving data */ - struct inode *data_inode; - struct btrfs_workers workers; - /* tree blocks have been processed */ - struct extent_io_tree processed_blocks; - /* map start of tree root to corresponding reloc tree */ - struct mapping_tree reloc_root_tree; - /* list of reloc trees */ - struct list_head reloc_roots; - u64 search_start; - u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; - unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; -}; - -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root.rb_node = NULL; - spin_lock_init(&tree->lock); -} - -static void backref_cache_init(struct backref_cache *cache) -{ - int i; - cache->rb_root.rb_node = NULL; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); -} - -static void
backref_node_init(struct backref_node *node) -{ - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct tree_entry *entry; - - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} - -/* - * walk up backref nodes until reaching a node that presents a tree root - */ -static struct backref_node *walk_up_backref(struct backref_node *node, - struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - int idx = *index; - - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct backref_edge, list[LOWER]); - edges[idx++] = edge; - node = edge->node[UPPER]; - } - *index = idx; - return node; -} - -/* - * walk down backref nodes to find start of next reference path - */ -static struct backref_node *walk_down_backref(struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - struct backref_node *lower; - int idx = *index; - - while (idx > 0) { - edge = edges[idx - 1]; - lower = edge->node[LOWER]; - if (list_is_last(&edge->list[LOWER], &lower->upper)) { - idx--; - continue; - } - edge = list_entry(edge->list[LOWER].next, - struct backref_edge, list[LOWER]); - edges[idx - 1] = edge; - *index = idx; - return edge->node[UPPER]; - } - *index = 0; - return NULL; -} - -static void drop_node_buffer(struct backref_node *node) -{ - if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -static void drop_backref_node(struct backref_cache *tree, - struct backref_node *node) -{ - BUG_ON(!node->lowest); - BUG_ON(!list_empty(&node->upper)); - - drop_node_buffer(node); - list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); -} - -/* - * remove a backref node from the backref cache - */ -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - struct backref_node *upper; - struct backref_edge *edge; - - if (!node) - return; - - BUG_ON(!node->lowest); - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - list_del(&edge->list[LOWER]); - list_del(&edge->list[UPPER]); - kfree(edge); - /* - * add the node to pending list if no other - * child block is cached.
- */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); - upper->lowest = 1; - } - } - drop_backref_node(cache, node); -} - -/* - * find reloc tree by address of tree root - */ -static struct btrfs_root *find_reloc_root(struct reloc_control *rc, - u64 bytenr) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct btrfs_root *root = NULL; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; - } - spin_unlock(&rc->reloc_root_tree.lock); - return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) - return 1; - return 0; -} - -static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) -{ - struct btrfs_key key; - - key.objectid = root_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; - - return btrfs_read_fs_root_no_name(fs_info, &key); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static noinline_for_stack -struct btrfs_root *find_tree_root(struct reloc_control *rc, - struct extent_buffer *leaf, - struct btrfs_extent_ref_v0 *ref0) -{ - struct btrfs_root *root; - u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); - u64 generation = btrfs_ref_generation_v0(leaf, ref0); - - BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); - - root = read_fs_root(rc->extent_root->fs_info, root_objectid); - BUG_ON(IS_ERR(root)); - - if (root->ref_cows && - generation != btrfs_root_generation(&root->root_item)) - return NULL; - - return root; -} -#endif - -static noinline_for_stack -int find_inline_backref(struct extent_buffer *leaf, int slot, - unsigned long *ptr, unsigned long *end) -{ - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - u32 item_size; - - item_size = btrfs_item_size_nr(leaf, slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - return 1; - } -#endif - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - WARN_ON(!(btrfs_extent_flags(leaf, ei) & - BTRFS_EXTENT_FLAG_TREE_BLOCK)); - - if (item_size <= sizeof(*ei) + sizeof(*bi)) { - WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); - return 1; - } - - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); - *end = (unsigned long)ei + item_size; - return 0; -} - -/* - * build backref tree for a given tree block. root of the backref tree - * corresponds to the tree block, leaves of the backref tree correspond - * to roots of b-trees that reference the tree block. - * - * the basic idea of this function is to check backrefs of a given block - * to find upper level blocks that reference the block, and then check - * backrefs of these upper level blocks recursively. the recursion stops - * when the tree root is reached or backrefs for the block are cached. - * - * NOTE: if we find backrefs for a block are cached, we know backrefs - * for all upper level blocks that directly/indirectly reference the - * block are also cached.
- */ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) -{ - struct btrfs_path *path1; - struct btrfs_path *path2; - struct extent_buffer *eb; - struct btrfs_root *root; - struct backref_node *cur; - struct backref_node *upper; - struct backref_node *lower; - struct backref_node *node = NULL; - struct backref_node *exist = NULL; - struct backref_edge *edge; - struct rb_node *rb_node; - struct btrfs_key key; - unsigned long end; - unsigned long ptr; - LIST_HEAD(list); - int ret; - int err = 0; - - path1 = btrfs_alloc_path(); - path2 = btrfs_alloc_path(); - if (!path1 || !path2) { - err = -ENOMEM; - goto out; - } - - node = kmalloc(sizeof(*node), GFP_NOFS); - if (!node) { - err = -ENOMEM; - goto out; - } - - backref_node_init(node); - node->bytenr = bytenr; - node->owner = 0; - node->level = level; - node->lowest = 1; - cur = node; -again: - end = 0; - ptr = 0; - key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path1->search_commit_root = 1; - path1->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, - 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(!ret || !path1->slots[0]); - - path1->slots[0]--; - - WARN_ON(cur->checked); - if (!list_empty(&cur->upper)) { - /* - * the backref was added previously when processing - * backref of type BTRFS_TREE_BLOCK_REF_KEY - */ - BUG_ON(!list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct backref_edge, - list[LOWER]); - BUG_ON(!list_empty(&edge->list[UPPER])); - exist = edge->node[UPPER]; - /* - * add the upper level block to pending list if we need - * to check its backrefs - */ - if (!exist->checked) - list_add_tail(&edge->list[UPPER], &list); - } else { - exist = NULL; - } - - while (1) { - cond_resched(); - eb = path1->nodes[0]; - - if (ptr >= end) { - if (path1->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - eb = path1->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); - if (key.objectid != cur->bytenr) { - WARN_ON(exist); - break; - } - - if (key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = find_inline_backref(eb, path1->slots[0], - &ptr, &end); - if (ret) - goto next; - } - } - - if (ptr < end) { - /* update key for inline back ref */ - struct btrfs_extent_inline_ref *iref; - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && - key.type != BTRFS_SHARED_BLOCK_REF_KEY); - } - - if (exist && - ((key.type == BTRFS_TREE_BLOCK_REF_KEY && - exist->owner == key.offset) || - (key.type == BTRFS_SHARED_BLOCK_REF_KEY && - exist->bytenr == key.offset))) { - exist = NULL; - goto next; - } - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(eb, path1->slots[0], - struct btrfs_extent_ref_v0); - root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; - } -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { -#endif - if (key.objectid == key.offset) { - /* - * only root
blocks of reloc trees use - * backref of this type. - */ - root = find_reloc_root(rc, cur->bytenr); - BUG_ON(!root); - cur->root = root; - break; - } - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (!edge) { - err = -ENOMEM; - goto out; - } - rb_node = tree_search(&cache->rb_root, key.offset); - if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); - if (!upper) { - kfree(edge); - err = -ENOMEM; - goto out; - } - backref_node_init(upper); - upper->bytenr = key.offset; - upper->owner = 0; - upper->level = cur->level + 1; - /* - * backrefs for the upper level block aren't - * cached, add the block to pending list - */ - list_add_tail(&edge->list[UPPER], &list); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; - edge->node[LOWER] = cur; - - goto next; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - goto next; - } - - /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ - root = read_fs_root(rc->extent_root->fs_info, key.offset); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - if (btrfs_root_level(&root->root_item) == cur->level) { - /* tree root */ - BUG_ON(btrfs_root_bytenr(&root->root_item) != - cur->bytenr); - cur->root = root; - break; - } - - level = cur->level + 1; - - /* - * searching the tree to find upper level blocks - * that reference the block. - */ - path2->search_commit_root = 1; - path2->skip_locking = 1; - path2->lowest_level = level; - ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); - path2->lowest_level = 0; - if (ret < 0) { - err = ret; - goto out; - } - - eb = path2->nodes[level]; - WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != - cur->bytenr); - - lower = cur; - for (; level < BTRFS_MAX_LEVEL; level++) { - if (!path2->nodes[level]) { - BUG_ON(btrfs_root_bytenr(&root->root_item) != - lower->bytenr); - lower->root = root; - break; - } - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (!edge) { - err = -ENOMEM; - goto out; - } - - eb = path2->nodes[level]; - rb_node = tree_search(&cache->rb_root, eb->start); - if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); - if (!upper) { - kfree(edge); - err = -ENOMEM; - goto out; - } - backref_node_init(upper); - upper->bytenr = eb->start; - upper->owner = btrfs_header_owner(eb); - upper->level = lower->level + 1; - - /* - * if we know the block isn't shared - * we can avoid checking its backrefs. - */ - if (btrfs_block_can_be_shared(root, eb)) - upper->checked = 0; - else - upper->checked = 1; - - /* - * add the block to pending list if we - * need to check its backrefs. only the block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks.
- */ - if (!upper->checked && - level == cur->level + 1) { - list_add_tail(&edge->list[UPPER], - &list); - } else - INIT_LIST_HEAD(&edge->list[UPPER]); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - BUG_ON(!upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; - edge->node[LOWER] = lower; - - if (rb_node) - break; - lower = upper; - upper = NULL; - } - btrfs_release_path(root, path2); -next: - if (ptr < end) { - ptr += btrfs_extent_inline_ref_size(key.type); - if (ptr >= end) { - WARN_ON(ptr > end); - ptr = 0; - end = 0; - } - } - if (ptr >= end) - path1->slots[0]++; - } - btrfs_release_path(rc->extent_root, path1); - - cur->checked = 1; - WARN_ON(exist); - - /* the pending list isn't empty, take the first block to process */ - if (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - cur = edge->node[UPPER]; - goto again; - } - - /* - * everything goes well, connect backref nodes and insert backref nodes - * into the cache. - */ - BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); - - list_for_each_entry(edge, &node->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - - while (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - upper = edge->node[UPPER]; - - if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - - list_add_tail(&edge->list[UPPER], &upper->lower); - continue; - } - - BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); - - list_add_tail(&edge->list[UPPER], &upper->lower); - - list_for_each_entry(edge, &upper->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - } -out: - btrfs_free_path(path1); - btrfs_free_path(path2); - if (err) { - INIT_LIST_HEAD(&list); - upper = node; - while (upper) { - if (RB_EMPTY_NODE(&upper->rb_node)) { - list_splice_tail(&upper->upper, &list); - kfree(upper); - } - - if (list_empty(&list)) - break; - - edge = list_entry(list.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - kfree(edge); - } - return ERR_PTR(err); - } - return node; -} - -/* - * helper to add 'address of tree root -> reloc tree' mapping - */ -static int __add_reloc_root(struct btrfs_root *root) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); - - node->bytenr = root->node->start; - node->data = root; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); - - list_add_tail(&root->root_list, &rc->reloc_roots); - return 0; -} - -/* - * helper to update/delete the 'address of tree root -> reloc tree' - * mapping - */ -static int __update_reloc_root(struct btrfs_root *root, int del) -{ - struct rb_node *rb_node; - struct mapping_node *node = NULL; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - rb_erase(&node->rb_node, 
&rc->reloc_root_tree.rb_root); - } - spin_unlock(&rc->reloc_root_tree.lock); - - BUG_ON((struct btrfs_root *)node->data != root); - - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); - } else { - list_del_init(&root->root_list); - kfree(node); - } - return 0; -} - -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb; - struct btrfs_root_item *root_item; - struct btrfs_key root_key; - int ret; - - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); - - root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; - - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); - memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); - btrfs_set_root_bytenr(root_item, eb->start); - btrfs_set_root_level(root_item, btrfs_header_level(eb)); - btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root_key, root_item); - BUG_ON(ret); - kfree(root_item); - - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); - BUG_ON(IS_ERR(reloc_root)); - reloc_root->last_trans = trans->transid; - - __add_reloc_root(reloc_root); - root->reloc_root = reloc_root; - return 0; -} - -/* - * update root item of reloc tree - */ -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - int del = 0; - int ret; - - if (!root->reloc_root) - return 0; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (btrfs_root_refs(root_item) == 0) { - root->reloc_root = NULL; - del = 1; - } - - __update_reloc_root(reloc_root, del); - - if (reloc_root->commit_root != reloc_root->node) { - btrfs_set_root_node(root_item, reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->commit_root = btrfs_root_node(reloc_root); - } - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &reloc_root->root_key, root_item); - BUG_ON(ret); - return 0; -} - -/* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < 
entry->vfs_inode.i_ino) - node = node->rb_left; - else if (objectid > entry->vfs_inode.i_ino) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= entry->vfs_inode.i_ino) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = entry->vfs_inode.i_ino + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -static int in_block_group(u64 bytenr, - struct btrfs_block_group_cache *block_group) -{ - if (bytenr >= block_group->key.objectid && - bytenr < block_group->key.objectid + block_group->key.offset) - return 1; - return 0; -} - -/* - * get new location of data - */ -static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, - u64 bytenr, u64 num_bytes) -{ - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bytenr -= BTRFS_I(reloc_inode)->index_cnt; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, - bytenr, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - BUG_ON(btrfs_file_extent_offset(leaf, fi) || - btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)); - - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = 1; - goto out; - } - - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -/* - * update file extent items in the tree leaf to point to - * the new locations. 
- */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) -{ - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; - struct inodevec *ivec = NULL; - u64 parent; - u64 bytenr; - u64 new_bytenr; - u64 num_bytes; - u64 end; - u32 nritems; - u32 i; - int ret; - int first = 1; - int dirty = 0; - - if (rc->stage != UPDATE_DATA_PTRS) - return 0; - - /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent = leaf->start; - else - parent = 0; - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (bytenr == 0) - continue; - if (!in_block_group(bytenr, rc->block_group)) - continue; - - /* - * if we are modifying a block in the fs tree, wait for readpage - * to complete and drop the extent cache - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } - if (first) { - inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; - first = 0; - } else if (inode && inode->i_ino < key.objectid) { - inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; - } - if (inode && inode->i_ino == key.objectid) { - end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - WARN_ON(!IS_ALIGNED(key.offset, - root->sectorsize)); - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); - if (!ret) - continue; - - btrfs_drop_extent_cache(inode, key.offset, end, - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); - } - } - - ret = get_new_location(rc->data_inode, &new_bytenr, - bytenr, num_bytes); - if (ret > 0) - continue; - BUG_ON(ret < 0); - - btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, - num_bytes, parent, - btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - parent, btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); - } - if (dirty) - btrfs_mark_buffer_dirty(leaf); - return 0; -} - -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) -{ - struct btrfs_disk_key key1; - struct btrfs_disk_key key2; - btrfs_node_key(eb, &key1, slot); - btrfs_node_key(path->nodes[level], &key2, path->slots[level]); - return memcmp(&key1, &key2, sizeof(key1)); -} - -/* - * try to replace tree blocks in fs tree with the new blocks - * in reloc tree. tree blocks that haven't been modified since the - * reloc tree was created can be replaced. - * - * if a block was replaced, level of the block + 1 is returned. - * if no block got replaced, 0 is returned.
if there are other - * errors, a negative error number is returned. - */ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 old_bytenr; - u64 new_bytenr; - u64 old_ptr_gen; - u64 new_ptr_gen; - u64 last_snapshot; - u32 blocksize; - int level; - int ret; - int slot; - - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); - - last_snapshot = btrfs_root_last_snapshot(&src->root_item); - - slot = path->slots[lowest_level]; - btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); - - eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking(eb); - level = btrfs_header_level(eb); - - if (level < lowest_level) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - return 0; - } - - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (next_key) { - next_key->objectid = (u64)-1; - next_key->type = (u8)-1; - next_key->offset = (u64)-1; - } - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); - - ret = btrfs_bin_search(parent, &key, level, &slot); - if (ret && slot > 0) - slot--; - - if (next_key && slot + 1 < btrfs_header_nritems(parent)) - btrfs_node_key_to_cpu(parent, next_key, slot + 1); - - old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); - old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - - if (level <= max_level) { - eb = path->nodes[level]; - new_bytenr = btrfs_node_blockptr(eb, - path->slots[level]); - new_ptr_gen = btrfs_node_ptr_generation(eb, - path->slots[level]); - } else { - new_bytenr = 0; - new_ptr_gen = 0; - } - - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); - ret = level; - break; - } - - if (new_bytenr == 0 || old_ptr_gen > last_snapshot || - memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { - ret = 0; - break; - } - - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); - btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; - break; - } - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - - parent = eb; - continue; - } - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - btrfs_release_path(src, path); - - path->lowest_level = level; - ret = btrfs_search_slot(trans, src, &key, path, 0, 1); - path->lowest_level = 0; - BUG_ON(ret); - - /* - * swap blocks in fs tree and reloc tree. 
- */ - btrfs_set_node_blockptr(parent, slot, new_bytenr); - btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); - - btrfs_set_node_blockptr(path->nodes[level], - path->slots[level], old_bytenr); - btrfs_set_node_ptr_generation(path->nodes[level], - path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); - - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); - BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); - BUG_ON(ret); - - btrfs_unlock_up_safe(path, 0); - - ret = level; - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return ret; -} - -/* - * helper to find next relocated block in reloc tree - */ -static noinline_for_stack -int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb; - int i; - u64 last_snapshot; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = 0; i < *level; i++) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - - for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] + 1 < nritems) { - path->slots[i]++; - if (btrfs_node_ptr_generation(eb, path->slots[i]) <= - last_snapshot) - continue; - - *level = i; - return 0; - } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - return 1; -} - -/* - * walk down reloc tree to find relocated block of lowest level - */ -static noinline_for_stack -int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb = NULL; - int i; - u64 bytenr; - u64 ptr_gen = 0; - u64 last_snapshot; - u32 blocksize; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = *level; i > 0; i--) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] < nritems) { - ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); - if (ptr_gen > last_snapshot) - break; - path->slots[i]++; - } - if (path->slots[i] >= nritems) { - if (i == *level) - break; - *level = i + 1; - return 0; - } - if (i == 1) { - *level = i; - return 0; - } - - bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = btrfs_level_size(root, i - 1); - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); - BUG_ON(btrfs_header_level(eb) != i - 1); - path->nodes[i - 1] = eb; - path->slots[i - 1] = 0; - } - return 1; -} - -/* - * invalidate extent cache for file extents whose key in range of - * [min_key, max_key) - */ -static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) -{ - struct inode *inode = NULL; - u64 objectid; - u64 start, end; - - objectid = min_key->objectid; - while (1) { - cond_resched(); - iput(inode); - - if (objectid > max_key->objectid) - break; - - inode = find_next_inode(root, objectid); - if (!inode) - break; - - if (inode->i_ino > max_key->objectid) { - iput(inode); - break; - } - - objectid = inode->i_ino + 1; - if (!S_ISREG(inode->i_mode)) - 
continue; - - if (unlikely(min_key->objectid == inode->i_ino)) { - if (min_key->type > BTRFS_EXTENT_DATA_KEY) - continue; - if (min_key->type < BTRFS_EXTENT_DATA_KEY) - start = 0; - else { - start = min_key->offset; - WARN_ON(!IS_ALIGNED(start, root->sectorsize)); - } - } else { - start = 0; - } - - if (unlikely(max_key->objectid == inode->i_ino)) { - if (max_key->type < BTRFS_EXTENT_DATA_KEY) - continue; - if (max_key->type > BTRFS_EXTENT_DATA_KEY) { - end = (u64)-1; - } else { - if (max_key->offset == 0) - continue; - end = max_key->offset; - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - } - } else { - end = (u64)-1; - } - - /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - } - return 0; -} - -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key) - -{ - while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - break; - if (path->slots[level] + 1 < - btrfs_header_nritems(path->nodes[level])) { - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - level++; - } - return 1; -} - -/* - * merge the relocated tree blocks in reloc tree with corresponding - * fs tree. - */ -static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, - struct btrfs_root *root) -{ - LIST_HEAD(inode_list); - struct btrfs_key key; - struct btrfs_key next_key; - struct btrfs_trans_handle *trans; - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - struct btrfs_path *path; - struct extent_buffer *leaf = NULL; - unsigned long nr; - int level; - int max_level; - int replaced = 0; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_root_level(root_item); - extent_buffer_get(reloc_root->node); - path->nodes[level] = reloc_root->node; - path->slots[level] = 0; - } else { - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - - level = root_item->drop_level; - BUG_ON(level == 0); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); - if (ret < 0) { - btrfs_free_path(path); - return ret; - } - - btrfs_node_key_to_cpu(path->nodes[level], &next_key, - path->slots[level]); - WARN_ON(memcmp(&key, &next_key, sizeof(key))); - - btrfs_unlock_up_safe(path, 0); - } - - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; - replaced = 0; - trans = btrfs_start_transaction(root, 1); - max_level = level; - - ret = walk_down_reloc_tree(reloc_root, path, &level); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - - if (!find_next_key(path, level, &key) && - btrfs_comp_cpu_keys(&next_key, &key) >= 0) { - ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, 
root, reloc_root, - path, &next_key, &leaf, - level, max_level); - } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); - } - if (ret < 0) { - err = ret; - goto out; - } - - if (ret > 0) { - level = ret; - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); - } - - ret = walk_up_reloc_tree(reloc_root, path, &level); - if (ret > 0) - break; - - BUG_ON(level == 0); - /* - * save the merging progress in the drop_progress. - * this is OK since root refs == 1 in this case. - */ - btrfs_node_key(path->nodes[level], &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - } - - /* - * handle the case only one block in the fs tree need to be - * relocated and the block is tree root. - */ - leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - if (ret < 0) - err = ret; -out: - btrfs_free_path(path); - - if (err == 0) { - memset(&root_item->drop_progress, 0, - sizeof(root_item->drop_progress)); - root_item->drop_level = 0; - btrfs_set_root_refs(root_item, 0); - } - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - /* - * put inodes while we aren't holding the tree locks - */ - while (!list_empty(&inode_list)) { - struct inodevec *ivec; - ivec = list_entry(inode_list.next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - - return err; -} - -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. 
- */ -static void merge_func(struct btrfs_work *work) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_root *reloc_root; - struct async_merge *async; - - async = container_of(work, struct async_merge, work); - reloc_root = async->root; - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - root = read_fs_root(reloc_root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); - btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); - } - - btrfs_drop_dead_root(reloc_root); - - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); - - kfree(async); -} - -static int merge_reloc_roots(struct reloc_control *rc) -{ - struct async_merge *async; - struct btrfs_root *root; - struct completion done; - atomic_t num_pending; - - init_completion(&done); - atomic_set(&num_pending, 1); - - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); - - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); - } - - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); - - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; -} - -static void free_block_list(struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - while ((rb_node = rb_first(blocks))) { - block = rb_entry(rb_node, struct tree_block, rb_node); - rb_erase(rb_node, blocks); - kfree(block); - } -} - -static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *reloc_root) -{ - struct btrfs_root *root; - - if (reloc_root->last_trans == trans->transid) - return 0; - - root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - return btrfs_record_root_in_trans(trans, root); -} - -/* - * select one tree from the trees that reference the block. - * for blocks in reference counted trees, we prefer the reloc tree. - * if no reloc tree is found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) -{ - struct backref_node *next; - struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for a non-reference counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); - break; - } - - if (loop) { - btrfs_record_root_in_trans(trans, root); - break; - } - - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in the current - * transaction, there is no node in the backref tree - * that corresponds to the root of the reloc tree.
- */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; - } - - if (root) - *nr = index; - else - *nr = 0; - - return root; -} - -static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) -{ - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); -} - -static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) -{ - return __select_one_root(trans, node, edges, nr, 1); -} - -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) -{ - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; - - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; - } -} - -/* - * relocate a block tree, and then update pointers in upper level - * blocks that reference the block to point to the new location. - * - * if called by link_to_upper, the block has already been relocated. - * in that case this function just updates pointers. - */ -static int do_relocation(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path, int lowest) -{ - struct backref_node *upper; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - struct btrfs_root *root; - struct extent_buffer *eb; - u32 blocksize; - u64 bytenr; - u64 generation; - int nr; - int slot; - int ret; - int err = 0; - - BUG_ON(lowest && node->eb); - - path->lowest_level = node->level + 1; - list_for_each_entry(edge, &node->upper, list[LOWER]) { - cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; - - upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) - drop_node_buffer(upper); - - if (!upper->eb) { - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - - slot = path->slots[upper->level]; - - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); - - btrfs_release_path(NULL, path); - } else { - ret = btrfs_bin_search(upper->eb, key, upper->level, - &slot); - BUG_ON(ret); - } - - bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } - } else { - BUG_ON(node->bytenr != bytenr); - } - - blocksize = btrfs_level_size(root, node->level); - generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, blocksize, generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - - if (!node->eb) { - ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); - if (ret < 0) { - err = ret; - break; - } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; - } else { - btrfs_set_node_blockptr(upper->eb, slot, - node->eb->start); - 
btrfs_set_node_ptr_generation(upper->eb, slot, - trans->transid); - btrfs_mark_buffer_dirty(upper->eb); - - ret = btrfs_inc_extent_ref(trans, root, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb), - node->level, 0); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } - } - path->lowest_level = 0; - return err; -} - -static int link_to_upper(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct btrfs_path *path) -{ - struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; - - btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); -} - -static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) -{ - struct backref_node *node; - int level; - int ret; - int err = 0; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); - - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this removes the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); - } - } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - return err; -} - -static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) -{ - u32 blocksize; - if (node->level == 0 || - in_block_group(node->bytenr, rc->block_group)) { - blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); - } - node->processed = 1; -} - -/* - * mark a block and all blocks that directly/indirectly reference the block - * as processed.
- */ -static void update_processed_blocks(struct reloc_control *rc, - struct backref_node *node) -{ - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int index = 0; - - while (next) { - cond_resched(); - while (1) { - if (next->processed) - break; - - mark_block_processed(rc, next); - - if (list_empty(&next->upper)) - break; - - edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); - edges[index++] = edge; - next = edge->node[UPPER]; - } - next = walk_down_backref(edges, &index); - } -} - -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) -{ - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) - return 1; - return 0; -} - -/* - * check if any file extent pointers in the leaf point to - * data that requires processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks that require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return
err; -} - -/* - * find adjacent blocks that require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - -static int get_tree_block_key(struct reloc_control *rc, - struct tree_block *block) -{ - struct extent_buffer *eb; - - BUG_ON(block->key_ready); - eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - WARN_ON(btrfs_header_level(eb) != block->level); - if (block->level == 0) - btrfs_item_key_to_cpu(eb, &block->key, 0); - else - btrfs_node_key_to_cpu(eb, &block->key, 0); - free_extent_buffer(eb); - block->key_ready = 1; - return 0; -} - -static int reada_tree_block(struct reloc_control *rc, - struct tree_block *block) -{ - BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - return 0; -} - -/* - * helper function to relocate a tree block - */ -static int relocate_tree_block(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path) -{ - struct btrfs_root *root; - int ret; - - root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; - update_processed_blocks(rc, node); - return 0; - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) - goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); - } - - update_processed_blocks(rc, node); - ret = 0; -out: - drop_node_buffer(node); - return ret; -} - -/* - * relocate a list of blocks - */ -static noinline_for_stack -int relocate_tree_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct rb_root *blocks) -{ - struct backref_cache *cache; - struct backref_node *node; - struct btrfs_path *path; - struct tree_block *block; - struct rb_node *rb_node; - int level = -1; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; - } - - backref_cache_init(cache); - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); - if (!block->key_ready) - reada_tree_block(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - 
while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - - node = build_backref_tree(rc, cache, &block->key, - block->level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } - - ret = relocate_tree_block(trans, rc, node, &block->key, - path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - - if (level > 0) - goto out; - - free_block_list(blocks); - - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. - */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; - - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } - - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); - - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } - } -out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; -} - -static noinline_for_stack -int relocate_inode_pages(struct inode *inode, u64 start, u64 len) -{ - u64 page_start; - u64 page_end; - unsigned long i; - unsigned long first_index; - unsigned long last_index; - unsigned int total_read = 0; - unsigned int total_dirty = 0; - struct page *page; - struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - int ret = 0; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - - mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - - /* make sure the dirty trick played by the caller works */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); - - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - min(last_index, ra->ra_pages + i - 1)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); - if (!page) { - ret = -ENOMEM; - goto out_unlock; - } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - goto out_unlock; - } - } - wait_on_page_writeback(page); - - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - 
unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - set_page_extent_mapped(page); - - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); - btrfs_set_extent_delalloc(inode, page_start, page_end); - - set_page_dirty(page); - total_dirty++; - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - } -out_unlock: - mutex_unlock(&inode->i_mutex); - kfree(ra); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); - return ret; -} - -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - spin_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); - } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - - return relocate_inode_pages(inode, start, extent_key->offset); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int get_ref_objectid_v0(struct reloc_control *rc, - struct btrfs_path *path, - struct btrfs_key *extent_key, - u64 *ref_objectid, int *path_change) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_ref_v0 *ref0; - int ret; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0]; - while (1) { - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - slot = path->slots[0]; - if (path_change) - *path_change = 1; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != extent_key->objectid) - return -ENOENT; - - if (key.type != BTRFS_EXTENT_REF_V0_KEY) { - slot++; - continue; - } - ref0 = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_ref_v0); - *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - return 0; -} -#endif - -/* - * helper to add a tree block to the list. 
- * the major work is getting the generation and level of the block - */ -static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - struct tree_block *block; - struct rb_node *rb_node; - u32 item_size; - int level = -1; - int generation; - - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - if (item_size >= sizeof(*ei) + sizeof(*bi)) { - ei = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); - generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int ret; - - BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, extent_key, - &ref_owner, NULL); - BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); - level = (int)ref_owner; - /* FIXME: get real generation */ - generation = 0; -#else - BUG(); -#endif - } - - btrfs_release_path(rc->extent_root, path); - - BUG_ON(level == -1); - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) - return -ENOMEM; - - block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; - block->key.offset = generation; - block->level = level; - block->key_ready = 0; - - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - - return 0; -} - -/* - * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY - */ -static int __add_tree_block(struct reloc_control *rc, - u64 bytenr, u32 blocksize, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - if (tree_block_processed(bytenr, blocksize, rc)) - return 0; - - if (tree_search(blocks, bytenr)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret); - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to check if the block uses full backrefs for the pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); - BUG_ON(ret); - - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - btrfs_free_path(path); - return ret; -} - -/* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans the fs tree to find blocks that reference the data extent - */ -static
int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; - struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); - - root = read_fs_root(rc->extent_root->fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - key.objectid = ref_objectid; - key.offset = ref_offset; - key.type = BTRFS_EXTENT_DATA_KEY; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - WARN_ON(1); - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); - break; - } - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, leaf->len, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - BUG_ON(rb_node); - } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - - } -out: - btrfs_free_path(path); - return err; -} - -/* - * helper to find all tree blocks that reference a given data extent - */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize; - int ret; - int err = 0; - 
- ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ptr + sizeof(struct btrfs_extent_item_v0) == end) - ptr = end; - else -#endif - ptr += sizeof(struct btrfs_extent_item); - - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - BUG(); - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); - - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) - break; - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_DATA_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -#endif - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = 0; - } - if (ret) { - err = ret; - break; - } - path->slots[0]++; - } - btrfs_release_path(rc->extent_root, path); - if (err) - free_block_list(blocks); - return err; -} - -/* - * helper to find the next unprocessed extent - */ -static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - u64 start, end, last; - int ret; - - last = rc->block_group->key.objectid + rc->block_group->key.offset; - while (1) { - cond_resched(); - if (rc->search_start >= last) { - ret = 1; - break; - } - - key.objectid = rc->search_start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = 0; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, - 0, 0); - if (ret < 0) - break; -next: - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret != 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid >= last) { - ret = 1; - break; - } - - if (key.type != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= rc->search_start) { - path->slots[0]++; - goto next; - } - - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY); - - if (ret == 0 && start <= key.objectid) { - btrfs_release_path(rc->extent_root, path); - rc->search_start = end + 1; - 
} else { - rc->search_start = key.objectid + key.offset; - return 0; - } - } - btrfs_release_path(rc->extent_root, path); - return ret; -} - -static void set_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); - fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); -} - -static void unset_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); - fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); -} - -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - -static noinline_for_stack int relocate_block_group(struct reloc_control *rc) -{ - struct rb_root blocks = RB_ROOT; - struct btrfs_key key; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - unsigned long nr; - u64 flags; - u32 item_size; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); - - ret = find_next_extent(trans, rc, path); - if (ret < 0) - err = ret; - if (ret != 0) - break; - - rc->extents_found++; - - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int path_change = 0; - - BUG_ON(item_size != - sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, - &path_change); - if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) - flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; - else - flags = BTRFS_EXTENT_FLAG_DATA; - - if (path_change) { - btrfs_release_path(rc->extent_root, path); - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - } -#else - BUG(); -#endif - } - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = add_tree_block(rc, &key, path, &blocks); - } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - ret = add_data_references(rc, &key, path, &blocks); - } else { - btrfs_release_path(rc->extent_root, path); - ret = 0; - } - if (ret < 0) { - err = 0; - break; - } - - if (!RB_EMPTY_ROOT(&blocks)) { - ret = relocate_tree_blocks(trans, rc, &blocks); - if (ret < 0) { - err = ret; - break; - } - } - - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); - trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); - - if (rc->stage == MOVE_DATA_EXTENTS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - 
rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, &key); - if (ret < 0) { - err = ret; - break; - } - } - } - btrfs_free_path(path); - - if (trans) { - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); - } - - rc->create_reloc_root = 0; - smp_mb(); - - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } - - merge_reloc_roots(rc); - - unset_reloc_control(rc); - - /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - return err; -} - -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to create inode for data relocation. - * the inode is in data relocation tree and its link count is 0 - */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) -{ - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key key; - unsigned long nr; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; - - root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(root)) - return ERR_CAST(root); - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) - goto out; - - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); - BUG_ON(err); - - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); - BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -/* - * function to relocate all extents in a block group. 
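- * relocation runs in stages: MOVE_DATA_EXTENTS copies the data extents out - * of the block group, then UPDATE_DATA_PTRS rewrites the file extent - * pointers; the caller loops over the two stages until no more extents are found.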
- */ -int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) -{ - struct btrfs_fs_info *fs_info = extent_root->fs_info; - struct reloc_control *rc; - int ret; - int err = 0; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) - return -ENOMEM; - - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); - - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); - - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); - - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - goto out; - } - - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); - - btrfs_start_delalloc_inodes(fs_info->tree_root); - btrfs_wait_ordered_extents(fs_info->tree_root, 0); - - while (1) { - mutex_lock(&fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(fs_info->tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - - rc->extents_found = 0; - rc->extents_skipped = 0; - - ret = relocate_block_group(rc); - if (ret < 0) { - err = ret; - break; - } - - if (rc->extents_found == 0) - break; - - printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); - - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - invalidate_mapping_pages(rc->data_inode->i_mapping, - 0, -1); - rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; - } - } - - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - - WARN_ON(rc->block_group->pinned > 0); - WARN_ON(rc->block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); -out: - iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); - btrfs_put_block_group(rc->block_group); - kfree(rc); - return err; -} - -/* - * recover relocation interrupted by system crash. - * - * this function resumes merging reloc trees with corresponding fs trees. 
- * this is important for keeping the sharing of tree blocks - */ -int btrfs_recover_relocation(struct btrfs_root *root) -{ - LIST_HEAD(reloc_roots); - struct btrfs_key key; - struct btrfs_root *fs_root; - struct btrfs_root *reloc_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct reloc_control *rc = NULL; - struct btrfs_trans_handle *trans; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_TREE_RELOC_OBJECTID; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, - path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root->fs_info->tree_root, path); - - if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || - key.type != BTRFS_ROOT_ITEM_KEY) - break; - - reloc_root = btrfs_read_fs_root_no_radix(root, &key); - if (IS_ERR(reloc_root)) { - err = PTR_ERR(reloc_root); - goto out; - } - - list_add(&reloc_root->root_list, &reloc_roots); - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out; - } - } - - if (key.offset == 0) - break; - - key.offset--; - } - btrfs_release_path(root->fs_info->tree_root, path); - - if (list_empty(&reloc_roots)) - goto out; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) { - err = -ENOMEM; - goto out; - } - - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); - rc->extent_root = root->fs_info->extent_root; - - set_reloc_control(rc); - - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - - if (btrfs_root_refs(&reloc_root->root_item) == 0) { - list_add_tail(&reloc_root->root_list, - &rc->reloc_roots); - continue; - } - - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); - - __add_reloc_root(reloc_root); - fs_root->reloc_root = reloc_root; - } - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - - merge_reloc_roots(rc); - - unset_reloc_control(rc); - - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } - btrfs_free_path(path); - - if (err == 0) { - /* cleanup orphan inode in data relocation tree */ - fs_root = read_fs_root(root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) - err = PTR_ERR(fs_root); - } - return err; -} - -/* - * helper to add ordered checksum for data relocation. - * - * cloning checksum properly handles the nodatasum extents. - * it also saves CPU time to re-calculate the checksum. 
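- * e.g. if the checksums were indexed from disk_bytenr D and the cloned - * extent now starts at ordered->start N, each per-sector bytenr is simply - * shifted by (N - D) before the sums are attached to the new ordered extent.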
- */ -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; - int ret; - u64 disk_bytenr; - LIST_HEAD(list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } - - btrfs_add_ordered_sum(inode, ordered, sums); - } - btrfs_put_ordered_extent(ordered); - return 0; -} diff --git a/trunk/fs/btrfs/root-tree.c b/trunk/fs/btrfs/root-tree.c index 0ddc6d61c55a..b48650de4472 100644 --- a/trunk/fs/btrfs/root-tree.c +++ b/trunk/fs/btrfs/root-tree.c @@ -111,15 +111,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, return ret; } -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) -{ - btrfs_set_root_bytenr(item, node->start); - btrfs_set_root_level(item, btrfs_header_level(node)); - btrfs_set_root_generation(item, btrfs_header_generation(node)); - return 0; -} - /* * copy the data in 'item' into the btree */ @@ -173,7 +164,8 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root * offset lower than the latest root. They need to be queued for deletion to * finish what was happening when we crashed. 
*/ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) { struct btrfs_root *dead_root; struct btrfs_item *item; @@ -235,7 +227,10 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) goto err; } - ret = btrfs_add_dead_root(dead_root); + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); if (ret) goto err; goto again; diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index 9f179d4832d5..2ff7cd2db25f 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -52,6 +52,7 @@ #include "export.h" #include "compression.h" + static struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) @@ -66,8 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,8 +84,6 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, @@ -159,7 +158,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); + printk(KERN_INFO "btrfs: setting nodatacsum\n"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: @@ -175,19 +174,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; - case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); - break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); btrfs_set_opt(info->mount_opt, NOBARRIER); @@ -336,7 +322,7 @@ static int btrfs_fill_super(struct super_block *sb, struct dentry *root_dentry; struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; - struct btrfs_key key; + struct btrfs_inode *bi; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -355,15 +341,23 @@ static int btrfs_fill_super(struct super_block *sb, } sb->s_fs_info = tree_root; disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + if (!inode) { + err = 
-ENOMEM; goto fail_close; } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } root_dentry = d_alloc_root(inode); if (!root_dentry) { @@ -394,6 +388,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -404,6 +402,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; return ret; } @@ -434,11 +433,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(root, COMPRESS)) seq_puts(seq, ",compress"); - if (btrfs_test_opt(root, NOSSD)) - seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) - seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) seq_puts(seq, ",notreelog"); @@ -449,6 +444,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + static int btrfs_test_super(struct super_block *s, void *data) { struct btrfs_fs_devices *test_fs_devices = data; @@ -584,8 +584,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_reloc_trees(root); WARN_ON(ret); ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -679,6 +678,7 @@ static int btrfs_unfreeze(struct super_block *sb) static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, + .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 2e177d7f4bb9..01b143605ec1 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -25,6 +25,7 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" +#include "ref-cache.h" #include "tree-log.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -93,37 +94,45 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->root_item.refs == 0); - WARN_ON(root->commit_root != root->node); - - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - root->last_trans = trans->transid; - btrfs_init_reloc_root(trans, root); - } - return 0; -} + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = 
kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; + root->commit_root = btrfs_root_node(root); - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); - return 0; - } + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } return 0; } @@ -172,6 +181,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); + btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -182,7 +192,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->delayed_ref_updates = 0; root->fs_info->running_transaction->use_count++; - record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); return h; } @@ -224,7 +233,6 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } -#if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new * operations if the drop_snapshot code isn't able to keep up. 
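/*
 * Editor's sketch, not part of the patch: the throttle_on_drops() path
 * restored by the surrounding hunks is a counter-and-waitqueue pattern --
 * snapshot-dropping work bumps fs_info->throttles and throttled callers
 * sleep on fs_info->transaction_throttle until it drains. A minimal
 * userspace analogue of that pattern, with hypothetical names and
 * pthreads standing in for the kernel's atomic_t/wait_event API:
 */
#include <pthread.h>

static pthread_mutex_t throttle_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t throttle_drained = PTHREAD_COND_INITIALIZER;
static int throttles; /* stands in for fs_info->throttles */

static void throttle_inc(void) /* heavy work (e.g. a snapshot drop) begins */
{
	pthread_mutex_lock(&throttle_lock);
	throttles++;
	pthread_mutex_unlock(&throttle_lock);
}

static void throttle_dec(void) /* heavy work done: wake any waiters */
{
	pthread_mutex_lock(&throttle_lock);
	if (--throttles == 0)
		pthread_cond_broadcast(&throttle_drained);
	pthread_mutex_unlock(&throttle_lock);
}

static void throttle_wait(void) /* rough analogue of throttle_on_drops() */
{
	pthread_mutex_lock(&throttle_lock);
	while (throttles > 0)
		pthread_cond_wait(&throttle_drained, &throttle_lock);
	pthread_mutex_unlock(&throttle_lock);
}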
@@ -265,7 +273,6 @@ static void throttle_on_drops(struct btrfs_root *root) goto harder; } } -#endif void btrfs_throttle(struct btrfs_root *root) { @@ -273,6 +280,7 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); + throttle_on_drops(root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -315,6 +323,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (throttle) + throttle_on_drops(root); + return 0; } @@ -451,8 +462,12 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); - btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); @@ -462,16 +477,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); return 0; } /* * update all the cowonly tree roots on disk */ -static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; @@ -507,54 +520,118 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) { + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + mutex_lock(&root->fs_info->trans_mutex); - list_add(&root->root_list, &root->fs_info->dead_roots); + list_add(&dirty->list, &latest->fs_info->dead_roots); mutex_unlock(&root->fs_info->trans_mutex); return 0; } /* - * update all the cowonly tree roots on disk + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. 
This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list */ -static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) { + struct btrfs_dirty_root *dirty; struct btrfs_root *gang[8]; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root; int i; int ret; int err = 0; + u32 refs; while (1) { - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, - (void **)gang, 0, + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { root = gang[i]; - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); + btrfs_free_reloc_root(trans, root); - if (root->commit_root == root->node) - continue; + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); - btrfs_set_root_node(&root->root_item, root->node); - err = btrfs_update_root(trans, fs_info->tree_root, + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); if (err) break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } } } return err; @@ -611,8 +688,12 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } @@ -624,61 +705,111 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them */ -int btrfs_drop_dead_root(struct btrfs_root 
*root) +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) { + struct btrfs_dirty_root *dirty; struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = root->fs_info->tree_root; unsigned long nr; - int ret; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; - while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); - trans = btrfs_start_transaction(tree_root, 1); + while (!list_empty(list)) { + struct btrfs_root *root; - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; + } + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); - ret = btrfs_drop_snapshot(trans, root); - if (ret != -EAGAIN) - break; + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); + btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } - ret = btrfs_update_root(trans, tree_root, - &root->root_key, - &root->root_item); - if (ret) + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); nr = trans->blocks_used; ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + btrfs_btree_balance_dirty(tree_root, nr); cond_resched(); } - BUG_ON(ret); - - ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); - - free_extent_buffer(root->node); - 
free_extent_buffer(root->commit_root); - kfree(root); - - btrfs_btree_balance_dirty(tree_root, nr); return ret; } @@ -708,23 +839,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) goto fail; - record_root_in_trans(trans, root); + btrfs_record_root_in_trans(root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = 0; + key.offset = trans->transid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); - btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_set_root_node(new_root_item, tmp); + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); @@ -832,24 +964,6 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, return 0; } -static void update_super_roots(struct btrfs_root *root) -{ - struct btrfs_root_item *root_item; - struct btrfs_super_block *super; - - super = &root->fs_info->super_copy; - - root_item = &root->fs_info->chunk_root->root_item; - super->chunk_root = root_item->bytenr; - super->chunk_root_generation = root_item->generation; - super->chunk_root_level = root_item->level; - - root_item = &root->fs_info->tree_root->root_item; - super->root = root_item->bytenr; - super->generation = root_item->generation; - super->root_level = root_item->level; -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -857,6 +971,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; @@ -883,6 +999,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); mutex_lock(&root->fs_info->trans_mutex); + INIT_LIST_HEAD(&dirty_fs_roots); if (cur_trans->in_commit) { cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); @@ -988,36 +1105,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * with the tree-log code. 
*/ mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + - ret = commit_fs_roots(trans, root); + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); BUG_ON(ret); - /* commit_fs_roots gets rid of all the tree log roots, it is now + /* add_dirty_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, root->fs_info); - ret = commit_cowonly_roots(trans, root); + ret = btrfs_commit_tree_roots(trans, root); BUG_ON(ret); cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->new_trans_lock); - - btrfs_set_root_node(&root->fs_info->tree_root->root_item, - root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); - - btrfs_set_root_node(&root->fs_info->chunk_root->root_item, - root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); - - update_super_roots(root); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); if (!root->fs_info->log_root_recovering) { btrfs_set_super_log_root(&root->fs_info->super_copy, 0); @@ -1031,6 +1153,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); mutex_unlock(&root->fs_info->trans_mutex); @@ -1047,6 +1170,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root, pinned_copy); kfree(pinned_copy); + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1060,9 +1186,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); return ret; } @@ -1071,17 +1204,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { - LIST_HEAD(list); - struct btrfs_fs_info *fs_info = root->fs_info; - - mutex_lock(&fs_info->trans_mutex); - list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + 
mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); - while (!list_empty(&list)) { - root = list_entry(list.next, struct btrfs_root, root_list); - list_del_init(&root->root_list); - btrfs_drop_dead_root(root); + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; } return 0; } diff --git a/trunk/fs/btrfs/transaction.h b/trunk/fs/btrfs/transaction.h index 961c3ee5a2e1..94f5bde2b58d 100644 --- a/trunk/fs/btrfs/transaction.h +++ b/trunk/fs/btrfs/transaction.h @@ -62,6 +62,12 @@ struct btrfs_pending_snapshot { struct list_head list; }; +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -94,8 +100,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_drop_dead_root(struct btrfs_root *root); +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -103,8 +108,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); #endif diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index c13922206d1b..db5e212e8445 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -430,16 +430,18 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { - struct btrfs_key key; struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); - if (IS_ERR(inode)) { - inode = NULL; - } else if (is_bad_inode(inode)) { + } + if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -539,7 +541,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -554,7 +555,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; - offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -569,16 +569,19 @@ static noinline int 
replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - 0, root->root_key.objectid, - key->objectid, offset); + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_file_extent(trans, - root, root->root_key.objectid, - key->objectid, offset, &ins); + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -1703,6 +1706,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1747,6 +1753,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1801,6 +1811,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1868,6 +1884,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -2006,7 +2027,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - btrfs_set_root_node(&log->root_item, log->node); + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); root->log_batch = 0; root->log_transid++; @@ -2558,7 +2581,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++, dst_path->slots[0]++) { + for (i = 0; i < nr; i++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2594,31 +2617,36 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { 
cs = 0; cl = dl; } - - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } } } + dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -3001,7 +3029,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); + mutex_lock(&fs_info->trans_mutex); + btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -3019,7 +3049,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) diff --git a/trunk/fs/btrfs/volumes.c b/trunk/fs/btrfs/volumes.c index 3ab80e9cd767..a6d35b0054ca 100644 --- a/trunk/fs/btrfs/volumes.c +++ b/trunk/fs/btrfs/volumes.c @@ -161,10 +161,8 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long num_sync_run; - unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; - int force_reg = 0; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -178,22 +176,19 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: - num_run = 0; /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - if (!force_reg && device->pending_sync_bios.head) { + if (device->pending_sync_bios.head) pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { + else pending_bios = &device->pending_bios; - force_reg = 0; - } pending = pending_bios->head; tail = pending_bios->tail; @@ -233,14 +228,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) while (pending) { rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); spin_lock(&device->io_lock); requeue_list(pending_bios, pending, tail); goto loop_lock; @@ -258,8 +249,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) BUG_ON(atomic_read(&cur->bi_cnt) == 0); submit_bio(cur->bi_rw, cur); num_run++; - batch_run++; - if (bio_sync(cur)) num_sync_run++; @@ -276,7 +265,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) * is now congested. 
Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && + if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; @@ -377,7 +366,6 @@ static noinline int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -404,11 +392,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } INIT_LIST_HEAD(&device->dev_alloc_list); - - mutex_lock(&fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_devices->devices); - mutex_unlock(&fs_devices->device_list_mutex); - device->fs_devices = fs_devices; fs_devices->num_devices++; } @@ -434,12 +418,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); - mutex_lock(&orig->device_list_mutex); list_for_each_entry(orig_dev, &orig->devices, dev_list) { device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) @@ -461,10 +443,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -475,7 +455,6 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); again: - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -495,7 +474,6 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) kfree(device->name); kfree(device); } - mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; @@ -616,9 +594,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable) { fs_devices->rw_devices++; @@ -1146,14 +1121,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; } } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; @@ -1208,16 +1181,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_brelse; device->in_fs_metadata = 0; - - /* - * the device list mutex makes sure that we don't change - * the device list while someone else is writing out all - * the device supers. 
- */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_del_init(&device->dev_list); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - device->fs_devices->num_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, @@ -1311,7 +1275,6 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); - mutex_init(&seed_devices->device_list_mutex); list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1437,10 +1400,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; @@ -1495,12 +1454,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->fs_devices = root->fs_info->fs_devices; - - /* - * we don't want write_supers to jump in here with our device - * half setup - */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -1509,9 +1462,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); btrfs_set_super_total_bytes(&root->fs_info->super_copy, total_bytes + device->total_bytes); @@ -1519,7 +1469,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); btrfs_set_super_num_devices(&root->fs_info->super_copy, total_bytes + 1); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { ret = init_first_rw_device(trans, root, device); @@ -1722,6 +1671,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, int ret; int i; + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; diff --git a/trunk/fs/btrfs/volumes.h b/trunk/fs/btrfs/volumes.h index 5139a833f721..5c3ff6d02fd7 100644 --- a/trunk/fs/btrfs/volumes.h +++ b/trunk/fs/btrfs/volumes.h @@ -96,12 +96,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 total_rw_bytes; struct block_device *latest_bdev; - - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code - */ - struct mutex device_list_mutex; + /* all of the devices in the FS */ struct list_head devices; /* devices not currently being allocated */ @@ -112,11 +107,6 @@ struct btrfs_fs_devices { int seeding; int opened; - - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ - int rotating; }; struct btrfs_bio_stripe { diff --git a/trunk/fs/cachefiles/interface.c b/trunk/fs/cachefiles/interface.c index 
431accd475a7..1e962348d111 100644 --- a/trunk/fs/cachefiles/interface.c +++ b/trunk/fs/cachefiles/interface.c @@ -354,9 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disc */ cachefiles_begin_secure(cache, &saved_cred); - down_read(&cache->mnt->mnt_sb->s_umount); - ret = sync_filesystem(cache->mnt->mnt_sb); - up_read(&cache->mnt->mnt_sb->s_umount); + ret = fsync_super(cache->mnt->mnt_sb); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) diff --git a/trunk/fs/char_dev.c b/trunk/fs/char_dev.c index b7c9d5187a75..38f71222a552 100644 --- a/trunk/fs/char_dev.c +++ b/trunk/fs/char_dev.c @@ -375,6 +375,7 @@ static int chrdev_open(struct inode *inode, struct file *filp) p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; + inode->i_cindex = idx; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) @@ -404,18 +405,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } -int cdev_index(struct inode *inode) -{ - int idx; - struct kobject *kobj; - - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -1; - kobject_put(kobj); - return idx; -} - void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -568,7 +557,6 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(register_chrdev); EXPORT_SYMBOL(unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/trunk/fs/cifs/cifs_dfs_ref.c b/trunk/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..83d62759c7c7 100644 --- a/trunk/fs/cifs/cifs_dfs_ref.c +++ b/trunk/fs/cifs/cifs_dfs_ref.c @@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; default: diff --git a/trunk/fs/cifs/cifsfs.c b/trunk/fs/cifs/cifsfs.c index 0d92114195ab..0a10a59b6392 100644 --- a/trunk/fs/cifs/cifsfs.c +++ b/trunk/fs/cifs/cifsfs.c @@ -204,9 +204,6 @@ cifs_put_super(struct super_block *sb) cFYI(1, ("Empty cifs superblock info passed to unmount")); return; } - - lock_kernel(); - rc = cifs_umount(sb, cifs_sb); if (rc) cERROR(1, ("cifs_umount failed with return code %d", rc)); @@ -219,8 +216,7 @@ cifs_put_super(struct super_block *sb) unload_nls(cifs_sb->local_nls); kfree(cifs_sb); - - unlock_kernel(); + return; } static int diff --git a/trunk/fs/compat.c b/trunk/fs/compat.c index 6aefb776dfeb..bb2a9b2e8173 100644 --- a/trunk/fs/compat.c +++ b/trunk/fs/compat.c @@ -812,8 +812,10 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, } } + lock_kernel(); retval = do_mount((char*)dev_page, dir_page, (char*)type_page, flags, (void*)data_page); + unlock_kernel(); out4: free_page(data_page); diff --git a/trunk/fs/dcache.c b/trunk/fs/dcache.c index 9e5cd3c3a6ba..75659a6fd1f8 100644 --- a/trunk/fs/dcache.c +++ b/trunk/fs/dcache.c @@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root, spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); - if (d_unlinked(dentry) && + if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; @@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) spin_lock(&dcache_lock); prepend(&end, &buflen, 
"\0", 1); - if (d_unlinked(dentry) && + if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, "//deleted", 9) != 0)) goto Elong; if (buflen < 1) @@ -2097,8 +2097,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) read_unlock(¤t->fs->lock); error = -ENOENT; + /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (!d_unlinked(pwd.dentry)) { + if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; diff --git a/trunk/fs/ecryptfs/super.c b/trunk/fs/ecryptfs/super.c index 12d649602d3a..fa4c7e7d15d9 100644 --- a/trunk/fs/ecryptfs/super.c +++ b/trunk/fs/ecryptfs/super.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include "ecryptfs_kernel.h" @@ -121,13 +120,9 @@ static void ecryptfs_put_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); - lock_kernel(); - ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); ecryptfs_set_superblock_private(sb, NULL); - - unlock_kernel(); } /** diff --git a/trunk/fs/exofs/super.c b/trunk/fs/exofs/super.c index 8216c5b77b53..9f1985e857e2 100644 --- a/trunk/fs/exofs/super.c +++ b/trunk/fs/exofs/super.c @@ -200,21 +200,20 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -static int exofs_sync_fs(struct super_block *sb, int wait) +static void exofs_write_super(struct super_block *sb) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct osd_request *or; struct osd_obj_id obj; - int ret = -ENOMEM; + int ret; fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); if (!fscb) { EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); - return -ENOMEM; + return; } - lock_super(sb); lock_kernel(); sbi = sb->s_fs_info; fscb->s_nextid = cpu_to_le64(sbi->s_nextid); @@ -247,17 +246,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) if (or) osd_end_request(or); unlock_kernel(); - unlock_super(sb); kfree(fscb); - return ret; -} - -static void exofs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - exofs_sync_fs(sb, 1); - else - sb->s_dirt = 0; } /* @@ -269,11 +258,6 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - lock_kernel(); - - if (sb->s_dirt) - exofs_write_super(sb); - /* make sure there are no pending commands */ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; num_pend = atomic_read(&sbi->s_curr_pending)) { @@ -287,8 +271,6 @@ static void exofs_put_super(struct super_block *sb) osduld_put_device(sbi->s_dev); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* @@ -502,7 +484,6 @@ static const struct super_operations exofs_sops = { .delete_inode = exofs_delete_inode, .put_super = exofs_put_super, .write_super = exofs_write_super, - .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; diff --git a/trunk/fs/ext2/Makefile b/trunk/fs/ext2/Makefile index f42af45cfd88..e0b2b43c1fdb 100644 --- a/trunk/fs/ext2/Makefile +++ b/trunk/fs/ext2/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT2_FS) += ext2.o -ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ +ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o diff --git a/trunk/fs/ext2/dir.c b/trunk/fs/ext2/dir.c index 003500498c22..2999d72153b7 100644 --- a/trunk/fs/ext2/dir.c +++ 
b/trunk/fs/ext2/dir.c @@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .fsync = simple_fsync, + .fsync = ext2_sync_file, }; diff --git a/trunk/fs/ext2/ext2.h b/trunk/fs/ext2/ext2.h index b2bbf45039e0..3203042b36ef 100644 --- a/trunk/fs/ext2/ext2.h +++ b/trunk/fs/ext2/ext2.h @@ -113,6 +113,9 @@ extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); +/* fsync.c */ +extern int ext2_sync_file (struct file *, struct dentry *, int); + /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, int); extern void ext2_free_inode (struct inode *); diff --git a/trunk/fs/ext2/file.c b/trunk/fs/ext2/file.c index 2b9e47dc9222..45ed07122182 100644 --- a/trunk/fs/ext2/file.c +++ b/trunk/fs/ext2/file.c @@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; @@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = { .mmap = xip_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_sync_file, }; #endif diff --git a/trunk/fs/ext2/fsync.c b/trunk/fs/ext2/fsync.c new file mode 100644 index 000000000000..fc66c93fcb5c --- /dev/null +++ b/trunk/fs/ext2/fsync.c @@ -0,0 +1,50 @@ +/* + * linux/fs/ext2/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include "ext2.h" +#include /* for sync_mapping_buffers() */ + + +/* + * File may be NULL when we are called. Perhaps we shouldn't + * even pass file to fsync ? + */ + +int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = ext2_sync_inode(inode); + if (ret == 0) + ret = err; + return ret; +} diff --git a/trunk/fs/ext2/inode.c b/trunk/fs/ext2/inode.c index 29ed682061f6..acf678831103 100644 --- a/trunk/fs/ext2/inode.c +++ b/trunk/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int ext2_update_inode(struct inode * inode, int do_sync); + /* * Test whether an inode is a fast symlink. 
*/ @@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + ext2_update_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1337,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int ext2_update_inode(struct inode * inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, int wait) +{ + return ext2_update_inode(inode, wait); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { diff --git a/trunk/fs/ext2/super.c b/trunk/fs/ext2/super.c index 458999638c3d..e3c748faf2db 100644 --- a/trunk/fs/ext2/super.c +++ b/trunk/fs/ext2/super.c @@ -42,7 +42,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); -static int ext2_sync_fs(struct super_block *sb, int wait); void ext2_error (struct super_block * sb, const char * function, const char * fmt, ...) @@ -115,11 +114,6 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - lock_kernel(); - - if (sb->s_dirt) - ext2_write_super(sb); - ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -141,7 +135,7 @@ static void ext2_put_super (struct super_block * sb) kfree(sbi->s_blockgroup_lock); kfree(sbi); - unlock_kernel(); + return; } static struct kmem_cache * ext2_inode_cachep; @@ -310,7 +304,6 @@ static const struct super_operations ext2_sops = { .delete_inode = ext2_delete_inode, .put_super = ext2_put_super, .write_super = ext2_write_super, - .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, .clear_inode = ext2_clear_inode, @@ -1134,36 +1127,25 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) * set s_state to EXT2_VALID_FS after some corrections. 
*/ -static int ext2_sync_fs(struct super_block *sb, int wait) +void ext2_write_super (struct super_block * sb) { - struct ext2_super_block *es = EXT2_SB(sb)->s_es; - + struct ext2_super_block * es; lock_kernel(); - if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { - ext2_debug("setting valid to 0\n"); - es->s_state &= cpu_to_le16(~EXT2_VALID_FS); - es->s_free_blocks_count = - cpu_to_le32(ext2_count_free_blocks(sb)); - es->s_free_inodes_count = - cpu_to_le32(ext2_count_free_inodes(sb)); - es->s_mtime = cpu_to_le32(get_seconds()); - ext2_sync_super(sb, es); - } else { - ext2_commit_super(sb, es); + if (!(sb->s_flags & MS_RDONLY)) { + es = EXT2_SB(sb)->s_es; + + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug ("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_mtime = cpu_to_le32(get_seconds()); + ext2_sync_super(sb, es); + } else + ext2_commit_super (sb, es); } sb->s_dirt = 0; unlock_kernel(); - - return 0; -} - - -void ext2_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - ext2_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) @@ -1175,8 +1157,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; - lock_kernel(); - /* Store the old options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -1212,16 +1192,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || - !(sbi->s_mount_state & EXT2_VALID_FS)) { - unlock_kernel(); + !(sbi->s_mount_state & EXT2_VALID_FS)) return 0; - } /* * OK, we are remounting a valid rw partition rdonly, so set * the rdonly flag and then mark the partition as valid again. 
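The ext2_write_super() body restored above is the classic s_dirt contract: code that changes in-core superblock accounting sets sb->s_dirt, and the periodic ->write_super call recomputes the free-block and free-inode counts, stamps s_mtime, writes the superblock, and clears the flag; on a read-only mount nothing is written but the flag is still cleared. A toy model of that contract in plain C (invented names, not the ext2 code):

#include <stdio.h>

struct toy_super {
	int s_dirt;			/* in-core state differs from disk */
	int rdonly;
	unsigned int free_blocks;	/* in-core counter */
	unsigned int disk_free_blocks;	/* last value written out */
};

static void toy_write_super(struct toy_super *sb)
{
	if (!sb->rdonly && sb->s_dirt) {
		sb->disk_free_blocks = sb->free_blocks;	/* recompute + write */
		printf("super written: free_blocks=%u\n", sb->disk_free_blocks);
	}
	sb->s_dirt = 0;	/* cleared even read-only, as in the hunk above */
}

int main(void)
{
	struct toy_super sb = { 0, 0, 100, 100 };

	sb.free_blocks -= 8;	/* an allocation dirties the in-core super */
	sb.s_dirt = 1;
	toy_write_super(&sb);	/* flushes and clears s_dirt */
	toy_write_super(&sb);	/* no-op: nothing dirty */
	return 0;
}

Clearing s_dirt even on the read-only path mirrors the code above and keeps a read-only superblock from being re-offered to the writer on every sync pass.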
@@ -1248,14 +1224,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags &= ~MS_RDONLY; } ext2_sync_super(sb, es); - unlock_kernel(); return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sb->s_flags = old_sb_flags; - unlock_kernel(); return err; } diff --git a/trunk/fs/ext3/balloc.c b/trunk/fs/ext3/balloc.c index 27967f92e820..225202db8974 100644 --- a/trunk/fs/ext3/balloc.c +++ b/trunk/fs/ext3/balloc.c @@ -649,7 +649,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, count = overflow; goto do_more; } - + sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext3_std_error(sb, err); @@ -1708,6 +1708,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (!fatal) fatal = err; + sb->s_dirt = 1; if (fatal) goto out; diff --git a/trunk/fs/ext3/ialloc.c b/trunk/fs/ext3/ialloc.c index b39991285136..dd13d60d524b 100644 --- a/trunk/fs/ext3/ialloc.c +++ b/trunk/fs/ext3/ialloc.c @@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) err = ext3_journal_dirty_metadata(handle, bitmap_bh); if (!fatal) fatal = err; - + sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext3_std_error(sb, fatal); @@ -537,6 +537,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); + sb->s_dirt = 1; inode->i_uid = current_fsuid(); if (test_opt (sb, GRPID)) diff --git a/trunk/fs/ext3/inode.c b/trunk/fs/ext3/inode.c index b0248c6d5d4c..fcfa24361856 100644 --- a/trunk/fs/ext3/inode.c +++ b/trunk/fs/ext3/inode.c @@ -2960,6 +2960,7 @@ static int ext3_do_update_inode(handle_t *handle, ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); diff --git a/trunk/fs/ext3/resize.c b/trunk/fs/ext3/resize.c index 8a0b26340b54..78fdf3836370 100644 --- a/trunk/fs/ext3/resize.c +++ b/trunk/fs/ext3/resize.c @@ -934,6 +934,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) EXT3_INODES_PER_GROUP(sb)); ext3_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; exit_journal: unlock_super(sb); @@ -1065,6 +1066,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; unlock_super(sb); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); diff --git a/trunk/fs/ext3/super.c b/trunk/fs/ext3/super.c index 26aa64dee6aa..3c70d52afb10 100644 --- a/trunk/fs/ext3/super.c +++ b/trunk/fs/ext3/super.c @@ -67,6 +67,7 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext3_unfreeze(struct super_block *sb); +static void ext3_write_super (struct super_block * sb); static int ext3_freeze(struct super_block *sb); /* @@ -398,8 +399,6 @@ static void ext3_put_super (struct super_block * sb) struct ext3_super_block *es = sbi->s_es; int i, err; - lock_kernel(); - ext3_xattr_put_super(sb); err = journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -448,8 +447,7 @@ static void ext3_put_super (struct 
super_block * sb) sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); - - unlock_kernel(); + return; } static struct kmem_cache *ext3_inode_cachep; @@ -763,6 +761,7 @@ static const struct super_operations ext3_sops = { .dirty_inode = ext3_dirty_inode, .delete_inode = ext3_delete_inode, .put_super = ext3_put_super, + .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, .freeze_fs = ext3_freeze, .unfreeze_fs = ext3_unfreeze, @@ -1786,6 +1785,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif + sb->s_dirt = 1; } if (sbi->s_blocks_per_group > blocksize * 8) { @@ -2265,6 +2265,7 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2307,6 +2308,7 @@ static int ext3_create_journal(struct super_block * sb, EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2352,6 +2354,7 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; ext3_commit_super(sb, es, 1); } unlock_super(sb); @@ -2410,14 +2413,29 @@ int ext3_force_commit(struct super_block *sb) return 0; journal = EXT3_SB(sb)->s_journal; + sb->s_dirt = 0; ret = ext3_journal_force_commit(journal); return ret; } +/* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. (We can probably nuke this function altogether, and remove + * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...) 
+ */ +static void ext3_write_super (struct super_block * sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + sb->s_dirt = 0; if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2433,6 +2451,7 @@ static int ext3_freeze(struct super_block *sb) { int error = 0; journal_t *journal; + sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT3_SB(sb)->s_journal; @@ -2490,10 +2509,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif - lock_kernel(); - /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -2601,8 +2617,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif - unlock_super(sb); - unlock_kernel(); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2619,8 +2633,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); - unlock_kernel(); return err; } diff --git a/trunk/fs/ext3/xattr.c b/trunk/fs/ext3/xattr.c index 545e37c4b91e..83b7be849bd5 100644 --- a/trunk/fs/ext3/xattr.c +++ b/trunk/fs/ext3/xattr.c @@ -463,6 +463,7 @@ static void ext3_xattr_update_super_block(handle_t *handle, if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); } } diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 012c4251397e..f016707597a7 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -576,11 +576,6 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; - lock_super(sb); - lock_kernel(); - if (sb->s_dirt) - ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -647,6 +642,8 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); + lock_super(sb); + lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); } @@ -3336,9 +3333,7 @@ int ext4_force_commit(struct super_block *sb) static void ext4_write_super(struct super_block *sb) { - lock_super(sb); ext4_commit_super(sb, 1); - unlock_super(sb); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3422,10 +3417,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int i; #endif - lock_kernel(); - /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -3559,8 +3551,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif - unlock_super(sb); - unlock_kernel(); return 0; restore_opts: @@ -3580,8 +3570,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); - unlock_kernel(); return err; } diff --git a/trunk/fs/fat/dir.c b/trunk/fs/fat/dir.c index f3500294eec5..3a7f603b6982 100644 --- a/trunk/fs/fat/dir.c +++ b/trunk/fs/fat/dir.c @@ -840,7 
+840,7 @@ const struct file_operations fat_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif - .fsync = fat_file_fsync, + .fsync = file_fsync, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, @@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) de++; nr_slots--; } - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) de--; nr_slots--; } - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, } memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); + mark_buffer_dirty(bhs[n]); n++; blknr++; @@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); - mark_buffer_dirty_inode(bhs[0], dir); + mark_buffer_dirty(bhs[0]); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); if (err) @@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, slots += copy; size -= copy; set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); + mark_buffer_dirty(bhs[n]); if (!size) break; n++; @@ -1293,7 +1293,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, for (i = 0; i < long_bhs; i++) { int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty_inode(bhs[i], dir); + mark_buffer_dirty(bhs[i]); offset = 0; slots += copy; size -= copy; @@ -1304,7 +1304,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, /* Fill the short name slot. 
*/ int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty_inode(bhs[i], dir); + mark_buffer_dirty(bhs[i]); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bhs[i]); } diff --git a/trunk/fs/fat/fat.h b/trunk/fs/fat/fat.h index e4d88527b5dd..ea440d65819c 100644 --- a/trunk/fs/fat/fat.h +++ b/trunk/fs/fat/fat.h @@ -74,7 +74,6 @@ struct msdos_sb_info { int fatent_shift; struct fatent_operations *fatent_ops; - struct inode *fat_inode; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; @@ -252,7 +251,6 @@ struct fat_entry { } u; int nr_bhs; struct buffer_head *bhs[2]; - struct inode *fat_inode; }; static inline void fatent_init(struct fat_entry *fatent) @@ -261,7 +259,6 @@ static inline void fatent_init(struct fat_entry *fatent) fatent->entry = 0; fatent->u.ent32_p = NULL; fatent->bhs[0] = fatent->bhs[1] = NULL; - fatent->fat_inode = NULL; } static inline void fatent_set_entry(struct fat_entry *fatent, int entry) @@ -278,7 +275,6 @@ static inline void fatent_brelse(struct fat_entry *fatent) brelse(fatent->bhs[i]); fatent->nr_bhs = 0; fatent->bhs[0] = fatent->bhs[1] = NULL; - fatent->fat_inode = NULL; } extern void fat_ent_access_init(struct super_block *sb); @@ -300,8 +296,6 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr); extern void fat_truncate(struct inode *inode); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, struct dentry *dentry, - int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); diff --git a/trunk/fs/fat/fatent.c b/trunk/fs/fat/fatent.c index 618f5305c2e4..da6eea47872f 100644 --- a/trunk/fs/fat/fatent.c +++ b/trunk/fs/fat/fatent.c @@ -73,8 +73,6 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); - fatent->fat_inode = MSDOS_SB(sb)->fat_inode; - bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; @@ -105,7 +103,6 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); - fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", @@ -170,9 +167,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) } spin_unlock(&fat12_entry_lock); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); if (fatent->nr_bhs == 2) - mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[1]); } static void fat16_ent_put(struct fat_entry *fatent, int new) @@ -181,7 +178,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); } static void fat32_ent_put(struct fat_entry *fatent, int new) @@ -192,7 +189,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new) WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); - mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + mark_buffer_dirty(fatent->bhs[0]); } static int fat12_ent_next(struct fat_entry *fatent) @@ -384,7 +381,7 @@ static int 
@@ -384,7 +381,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
 		}
 		memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 		set_buffer_uptodate(c_bh);
-		mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
+		mark_buffer_dirty(c_bh);
 		if (sb->s_flags & MS_SYNCHRONOUS)
 			err = sync_dirty_buffer(c_bh);
 		brelse(c_bh);
diff --git a/trunk/fs/fat/file.c b/trunk/fs/fat/file.c
index e955a56b4e5e..0a7f4a9918b3 100644
--- a/trunk/fs/fat/file.c
+++ b/trunk/fs/fat/file.c
@@ -133,18 +133,6 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int res, err;
-
-	res = simple_fsync(filp, dentry, datasync);
-	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
-
-	return res ? res : err;
-}
-
-
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -154,7 +142,7 @@ const struct file_operations fat_file_operations = {
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
-	.fsync		= fat_file_fsync,
+	.fsync		= file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/trunk/fs/fat/inode.c b/trunk/fs/fat/inode.c
index 51a5ecf9000a..296785a0dec8 100644
--- a/trunk/fs/fat/inode.c
+++ b/trunk/fs/fat/inode.c
@@ -441,35 +441,16 @@ static void fat_clear_inode(struct inode *inode)
 
 static void fat_write_super(struct super_block *sb)
 {
-	lock_super(sb);
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY))
 		fat_clusters_flush(sb);
-	unlock_super(sb);
-}
-
-static int fat_sync_fs(struct super_block *sb, int wait)
-{
-	lock_super(sb);
-	fat_clusters_flush(sb);
-	sb->s_dirt = 0;
-	unlock_super(sb);
-
-	return 0;
 }
 
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	lock_kernel();
-
-	if (sb->s_dirt)
-		fat_write_super(sb);
-
-	iput(sbi->fat_inode);
-
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
@@ -486,8 +467,6 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
@@ -653,7 +632,6 @@ static const struct super_operations fat_sops = {
 	.delete_inode	= fat_delete_inode,
 	.put_super	= fat_put_super,
 	.write_super	= fat_write_super,
-	.sync_fs	= fat_sync_fs,
 	.statfs		= fat_statfs,
 	.clear_inode	= fat_clear_inode,
 	.remount_fs	= fat_remount,
@@ -1196,7 +1174,7 @@ static int fat_read_root(struct inode *inode)
 int fat_fill_super(struct super_block *sb, void *data, int silent,
 		   const struct inode_operations *fs_dir_inode_ops, int isvfat)
 {
-	struct inode *root_inode = NULL, *fat_inode = NULL;
+	struct inode *root_inode = NULL;
 	struct buffer_head *bh;
 	struct fat_boot_sector *b;
 	struct msdos_sb_info *sbi;
@@ -1436,11 +1414,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	error = -ENOMEM;
-	fat_inode = new_inode(sb);
-	if (!fat_inode)
-		goto out_fail;
-	MSDOS_I(fat_inode)->i_pos = 0;
-	sbi->fat_inode = fat_inode;
 	root_inode = new_inode(sb);
 	if (!root_inode)
 		goto out_fail;
@@ -1466,8 +1439,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 		       " on dev %s.\n", sb->s_id);
 
 out_fail:
-	if (fat_inode)
-		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
 	if (sbi->nls_io)
diff --git a/trunk/fs/fat/namei_msdos.c b/trunk/fs/fat/namei_msdos.c
index 20f522861355..da3f361a37dd 100644
--- a/trunk/fs/fat/namei_msdos.c
+++ b/trunk/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			int start = MSDOS_I(new_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			if (IS_DIRSYNC(new_dir)) {
 				err = sync_dirty_buffer(dotdot_bh);
 				if (err)
@@ -586,7 +586,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			int start = MSDOS_I(old_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			corrupt |= sync_dirty_buffer(dotdot_bh);
 		}
 error_inode:
diff --git a/trunk/fs/fat/namei_vfat.c b/trunk/fs/fat/namei_vfat.c
index b50ecbe97f83..a0e00e3a46e9 100644
--- a/trunk/fs/fat/namei_vfat.c
+++ b/trunk/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			int start = MSDOS_I(new_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			if (IS_DIRSYNC(new_dir)) {
 				err = sync_dirty_buffer(dotdot_bh);
 				if (err)
@@ -1009,7 +1009,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			int start = MSDOS_I(old_dir)->i_logstart;
 			dotdot_de->start = cpu_to_le16(start);
 			dotdot_de->starthi = cpu_to_le16(start >> 16);
-			mark_buffer_dirty_inode(dotdot_bh, old_inode);
+			mark_buffer_dirty(dotdot_bh);
 			corrupt |= sync_dirty_buffer(dotdot_bh);
 		}
 error_inode:
diff --git a/trunk/fs/file_table.c b/trunk/fs/file_table.c
index 334ce39881f8..54018fe48840 100644
--- a/trunk/fs/file_table.c
+++ b/trunk/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
 		file_take_write(file);
-		error = mnt_clone_write(mnt);
+		error = mnt_want_write(mnt);
 		WARN_ON(error);
 	}
 	return error;
@@ -399,44 +399,6 @@ int fs_may_remount_ro(struct super_block *sb)
 	return 0;
 }
 
-/**
- *	mark_files_ro - mark all files read-only
- *	@sb: superblock in question
- *
- *	All files are marked read-only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only.
- */
-void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-			continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
 void __init files_init(unsigned long mempages)
 {
 	int n;
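Editorial aside, not part of the patch: the mark_files_ro() loop removed above is a textbook instance of the lock-drop-and-retry idiom, where a sleeping call cannot be made under a spinlock, so the walker drops the lock, sleeps, and restarts the scan from the top. A minimal userspace sketch of the same shape follows; all names in it are hypothetical.

#include <pthread.h>

/* Hypothetical types; this models only the idiom, not the kernel API. */
struct item {
	struct item *next;
	int needs_slow_work;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

/* Stands in for mnt_drop_write()/mntput(): may block, so it must not
 * run with list_lock held. */
static void slow_work(struct item *it)
{
	(void)it;
}

static void process_all(void)
{
	struct item *it;
retry:
	pthread_mutex_lock(&list_lock);
	for (it = head; it; it = it->next) {
		if (!it->needs_slow_work)
			continue;
		it->needs_slow_work = 0;	/* claim the item while locked */
		pthread_mutex_unlock(&list_lock);
		slow_work(it);			/* safe: lock was dropped */
		goto retry;			/* list may have changed meanwhile */
	}
	pthread_mutex_unlock(&list_lock);
}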
diff --git a/trunk/fs/freevxfs/vxfs_super.c b/trunk/fs/freevxfs/vxfs_super.c
index cdbd1654e4cd..1dacda831577 100644
--- a/trunk/fs/freevxfs/vxfs_super.c
+++ b/trunk/fs/freevxfs/vxfs_super.c
@@ -80,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
-	lock_kernel();
-
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
-
-	unlock_kernel();
 }
 
 /**
diff --git a/trunk/fs/fs-writeback.c b/trunk/fs/fs-writeback.c
index 40308e98c6a4..91013ff7dd53 100644
--- a/trunk/fs/fs-writeback.c
+++ b/trunk/fs/fs-writeback.c
@@ -64,28 +64,6 @@ static void writeback_release(struct backing_dev_info *bdi)
 	clear_bit(BDI_pdflush, &bdi->state);
 }
 
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
-{
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
-
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
-}
-
 /**
  *	__mark_inode_dirty - internal function
  *	@inode: inode to mark
@@ -136,8 +114,23 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
+	if (unlikely(block_dump)) {
+		struct dentry *dentry = NULL;
+		const char *name = "?";
+
+		if (!list_empty(&inode->i_dentry)) {
+			dentry = list_entry(inode->i_dentry.next,
+					    struct dentry, d_alias);
+			if (dentry && dentry->d_name.name)
+				name = (const char *) dentry->d_name.name;
+		}
+
+		if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
+			printk(KERN_DEBUG
+			       "%s(%d): dirtied inode %lu (%s) on %s\n",
+			       current->comm, task_pid_nr(current), inode->i_ino,
+			       name, inode->i_sb->s_id);
+	}
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
@@ -296,6 +289,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
+	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -320,6 +314,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
@@ -683,6 +678,55 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 	sync_sb_inodes(sb, &wbc);
 }
 
+/**
+ * sync_inodes - writes all inodes to disk
+ * @wait: wait for completion
+ *
+ * sync_inodes() goes through each super block's dirty inode list, writes the
+ * inodes out, waits on the writeout and puts the inodes back on the normal
+ * list.
+ *
+ * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
+ * part of the sync functions is that the blockdev "superblock" is processed
+ * last.  This is because the write_inode() function of a typical fs will
+ * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
+ * What we want to do is to perform all that dirtying first, and then write
+ * back all those inode blocks via the blockdev mapping in one sweep.  So the
+ * additional (somewhat redundant) sync_blockdev() calls here are to make
+ * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
+ * outstanding dirty inodes, the writeback goes block-at-a-time within the
+ * filesystem's write_inode().  This is extremely slow.
+ */
+static void __sync_inodes(int wait)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root) {
+			sync_inodes_sb(sb, wait);
+			sync_blockdev(sb->s_bdev);
+		}
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+
+void sync_inodes(int wait)
+{
+	__sync_inodes(0);
+
+	if (wait)
+		__sync_inodes(1);
+}
+
 /**
  * write_inode_now - write an inode to disk
  *	@inode: inode to write to disk
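Editorial aside, not part of the patch: the doc comment restored above explains why sync_inodes() makes two sweeps, one non-blocking pass to start I/O everywhere and, for a waiting sync, a second pass that blocks only after the queues are full. A userspace sketch of that two-pass shape, with entirely hypothetical names:

struct dev { int id; };

static void dev_flush(struct dev *d, int wait)
{
	(void)d;
	(void)wait;	/* stand-in: start I/O on one device, optionally wait */
}

static void flush_all(struct dev **devs, int ndev, int wait)
{
	int pass, i;

	/* Pass one (wait == 0) gets all I/O moving without blocking;
	 * pass two (wait == 1) runs only for a waiting sync and blocks
	 * once per device after everything is already in flight. */
	for (pass = 0; pass <= (wait ? 1 : 0); pass++)
		for (i = 0; i < ndev; i++)
			dev_flush(devs[i], pass);
}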
diff --git a/trunk/fs/gfs2/log.c b/trunk/fs/gfs2/log.c
index f2e449c595b4..aa62cf5976e8 100644
--- a/trunk/fs/gfs2/log.c
+++ b/trunk/fs/gfs2/log.c
@@ -764,6 +764,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	gfs2_log_unlock(sdp);
 
+	sdp->sd_vfs->s_dirt = 0;
 	up_write(&sdp->sd_log_flush_lock);
 
 	kfree(ai);
@@ -822,6 +823,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	log_refund(sdp, tr);
 	buf_lo_incore_commit(sdp, tr);
 
+	sdp->sd_vfs->s_dirt = 1;
 	up_read(&sdp->sd_log_flush_lock);
 
 	gfs2_log_lock(sdp);
diff --git a/trunk/fs/gfs2/super.c b/trunk/fs/gfs2/super.c
index c8930b31cdf0..40bcc37e5a70 100644
--- a/trunk/fs/gfs2/super.c
+++ b/trunk/fs/gfs2/super.c
@@ -719,8 +719,6 @@ static void gfs2_put_super(struct super_block *sb)
 	int error;
 	struct gfs2_jdesc *jd;
 
-	lock_kernel();
-
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -787,8 +785,17 @@
 	/*  At this point, we're through participating in the lockspace  */
 
 	gfs2_sys_fs_del(sdp);
+}
+
+/**
+ * gfs2_write_super
+ * @sb: the superblock
+ *
+ */
 
-	unlock_kernel();
+static void gfs2_write_super(struct super_block *sb)
+{
+	sb->s_dirt = 0;
 }
 
 /**
@@ -800,6 +807,7 @@ static void gfs2_put_super(struct super_block *sb)
 
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
+	sb->s_dirt = 0;
 	if (wait && sb->s_fs_info)
 		gfs2_log_flush(sb->s_fs_info, NULL);
 	return 0;
}
@@ -1316,6 +1324,7 @@ const struct super_operations gfs2_super_ops = {
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
 	.put_super		= gfs2_put_super,
+	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
diff --git a/trunk/fs/hfs/super.c b/trunk/fs/hfs/super.c
index 6f833dc8e910..a36bb749926d 100644
--- a/trunk/fs/hfs/super.c
+++ b/trunk/fs/hfs/super.c
@@ -49,23 +49,11 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
-	lock_super(sb);
 	sb->s_dirt = 0;
-
+	if (sb->s_flags & MS_RDONLY)
+		return;
 	/* sync everything to the buffers */
-	if (!(sb->s_flags & MS_RDONLY))
-		hfs_mdb_commit(sb);
-	unlock_super(sb);
-}
-
-static int hfs_sync_fs(struct super_block *sb, int wait)
-{
-	lock_super(sb);
 	hfs_mdb_commit(sb);
-	sb->s_dirt = 0;
-	unlock_super(sb);
-
-	return 0;
 }
 
 /*
@@ -77,15 +65,9 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
-	if (sb->s_dirt)
-		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -182,7 +164,6 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode	= hfs_clear_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
-	.sync_fs	= hfs_sync_fs,
 	.statfs		= hfs_statfs,
 	.remount_fs	= hfs_remount,
 	.show_options	= hfs_show_options,
diff --git a/trunk/fs/hfsplus/super.c b/trunk/fs/hfsplus/super.c
index 9fc3af0c0dab..f2a64020f42e 100644
--- a/trunk/fs/hfsplus/super.c
+++ b/trunk/fs/hfsplus/super.c
@@ -152,14 +152,15 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfsplus_sync_fs(struct super_block *sb, int wait)
+static void hfsplus_write_super(struct super_block *sb)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
-
-	lock_super(sb);
 	sb->s_dirt = 0;
+	if (sb->s_flags & MS_RDONLY)
+		/* warn? */
+		return;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -191,16 +192,6 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
-	return 0;
-}
-
-static void hfsplus_write_super(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		hfsplus_sync_fs(sb, 1);
-	else
-		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -208,11 +199,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
-
-	lock_kernel();
-
-	if (sb->s_dirt)
-		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -232,8 +218,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -295,7 +279,6 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode	= hfsplus_clear_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
-	.sync_fs	= hfsplus_sync_fs,
 	.statfs		= hfsplus_statfs,
 	.remount_fs	= hfsplus_remount,
 	.show_options	= hfsplus_show_options,
diff --git a/trunk/fs/hpfs/super.c b/trunk/fs/hpfs/super.c
index f2feaa06bf26..fc77965be841 100644
--- a/trunk/fs/hpfs/super.c
+++ b/trunk/fs/hpfs/super.c
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -100,16 +99,11 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
-
-	lock_kernel();
-
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -399,8 +393,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
-	lock_kernel();
-	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -433,13 +425,9 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	replace_mount_options(s, new_opts);
 
-	unlock_super(s);
-	unlock_kernel();
 	return 0;
 
out_err:
-	unlock_super(s);
-	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/trunk/fs/inode.c b/trunk/fs/inode.c
index a88baebf77cf..bca0c618fdb3 100644
--- a/trunk/fs/inode.c
+++ b/trunk/fs/inode.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -190,10 +189,6 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 
-#ifdef CONFIG_FSNOTIFY
-	inode->i_fsnotify_mask = 0;
-#endif
-
 	return inode;
 
out_free_security:
@@ -226,7 +221,6 @@ void destroy_inode(struct inode *inode)
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
-	fsnotify_inode_delete(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
@@ -258,9 +252,6 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
-#ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
-#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
@@ -407,7 +398,6 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
-	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
@@ -1422,7 +1412,7 @@ void file_update_time(struct file *file)
 	if (IS_NOCMTIME(inode))
 		return;
 
-	err = mnt_want_write_file(file);
+	err = mnt_want_write(file->f_path.mnt);
 	if (err)
 		return;
diff --git a/trunk/fs/internal.h b/trunk/fs/internal.h
index d55ef562f0bb..b4dac4fb6b61 100644
--- a/trunk/fs/internal.h
+++ b/trunk/fs/internal.h
@@ -25,8 +25,6 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 	return sb == blockdev_superblock;
 }
 
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-
 #else
 static inline void bdev_cache_init(void)
 {
@@ -36,11 +34,6 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 {
 	return 0;
 }
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
-	return 0;
-}
 #endif
 
 /*
@@ -73,13 +66,3 @@ extern void __init mnt_init(void);
 * fs_struct.c
 */
 extern void chroot_fs_refs(struct path *, struct path *);
-
-/*
- * file_table.c
- */
-extern void mark_files_ro(struct super_block *);
-
-/*
- * super.c
- */
-extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/trunk/fs/isofs/inode.c b/trunk/fs/isofs/inode.c
index 068b34b5a107..b4cbe9603c7d 100644
--- a/trunk/fs/isofs/inode.c
+++ b/trunk/fs/isofs/inode.c
@@ -42,16 +42,11 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
 static void isofs_put_super(struct super_block *sb)
 {
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
-
 #ifdef CONFIG_JOLIET
-	lock_kernel();
-
 	if (sbi->s_nls_iocharset) {
 		unload_nls(sbi->s_nls_iocharset);
 		sbi->s_nls_iocharset = NULL;
 	}
-
-	unlock_kernel();
 #endif
 
 	kfree(sbi);
diff --git a/trunk/fs/jffs2/fs.c b/trunk/fs/jffs2/fs.c
index 3451a81b2142..249305d65d5b 100644
--- a/trunk/fs/jffs2/fs.c
+++ b/trunk/fs/jffs2/fs.c
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -388,7 +387,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if necessary, else we lose it */
-	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -401,10 +399,24 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
-	unlock_kernel();
 	return 0;
 }
 
+void jffs2_write_super (struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	sb->s_dirt = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+	jffs2_garbage_collect_trigger(c);
+	jffs2_erase_pending_blocks(c, 0);
+	jffs2_flush_wbuf_gc(c, 0);
+}
+
+
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/trunk/fs/jffs2/os-linux.h b/trunk/fs/jffs2/os-linux.h
index 2228380c47b9..5e194a5c8e29 100644
--- a/trunk/fs/jffs2/os-linux.h
+++ b/trunk/fs/jffs2/os-linux.h
@@ -181,6 +181,7 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
+void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/trunk/fs/jffs2/super.c b/trunk/fs/jffs2/super.c
index 07a22caf2687..4c4e18c54a51 100644
--- a/trunk/fs/jffs2/super.c
+++ b/trunk/fs/jffs2/super.c
@@ -53,29 +53,10 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
-static void jffs2_write_super(struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-
-	lock_super(sb);
-	sb->s_dirt = 0;
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-		jffs2_garbage_collect_trigger(c);
-		jffs2_erase_pending_blocks(c, 0);
-		jffs2_flush_wbuf_gc(c, 0);
-	}
-
-	unlock_super(sb);
-}
-
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
-	jffs2_write_super(sb);
-
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -193,11 +174,6 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
-	lock_kernel();
-
-	if (sb->s_dirt)
-		jffs2_write_super(sb);
-
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -216,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
-	unlock_kernel();
-
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
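Editorial aside, not part of the patch: the jffs2 and gfs2 hunks above move those filesystems back onto the s_dirt/write_super contract, where the filesystem raises sb->s_dirt when in-core superblock state changes and a periodic flusher calls ->write_super, which clears the flag before doing its work. A minimal userspace sketch of that contract, with hypothetical types, follows.

/* Hypothetical model of the s_dirt protocol; not the kernel API. */
struct super {
	int s_dirt;
	int read_only;
	void (*write_super)(struct super *);
};

/* A periodic flusher only calls back when the flag is raised. */
static void periodic_flush(struct super *sb)
{
	if (sb->s_dirt && sb->write_super)
		sb->write_super(sb);	/* the callback clears s_dirt itself */
}

static void example_write_super(struct super *sb)
{
	sb->s_dirt = 0;			/* clear first, like jffs2/gfs2 above,
					 * so a concurrent dirtier re-arms it */
	if (sb->read_only)
		return;
	/* ...push in-core superblock state out to the device... */
}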
diff --git a/trunk/fs/jfs/super.c b/trunk/fs/jfs/super.c
index 09b1b6ee2186..d9b0e92b3602 100644
--- a/trunk/fs/jfs/super.c
+++ b/trunk/fs/jfs/super.c
@@ -32,7 +32,6 @@
 #include
 #include
 #include
-#include
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -184,9 +183,6 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
-
-	lock_kernel();
-
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -199,8 +195,6 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 enum {
@@ -376,24 +370,19 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
-	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
-	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
-			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc) {
-			unlock_kernel();
+		if (rc)
 			return rc;
-		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -404,31 +393,23 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		ret = jfs_mount_rw(sb, 1);
-		unlock_kernel();
-		return ret;
+		return jfs_mount_rw(sb, 1);
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
-		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc) {
-				unlock_kernel();
+			if (rc)
 				return rc;
-			}
 			JFS_SBI(sb)->flag = flag;
-			ret = jfs_mount_rw(sb, 1);
-			unlock_kernel();
-			return ret;
+			return jfs_mount_rw(sb, 1);
 		}
 	JFS_SBI(sb)->flag = flag;
 
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/trunk/fs/libfs.c b/trunk/fs/libfs.c
index ddfa89948c3f..80046ddf5063 100644
--- a/trunk/fs/libfs.c
+++ b/trunk/fs/libfs.c
@@ -9,8 +9,6 @@
 #include
 #include
 #include
-#include
-#include
 
 #include
 
@@ -809,29 +807,6 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0, /* metadata-only; caller takes care of data */
-	};
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode(inode, &wbc);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-EXPORT_SYMBOL(simple_fsync);
-
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
diff --git a/trunk/fs/minix/dir.c b/trunk/fs/minix/dir.c
index e5f206467e40..d4946c4c90e2 100644
--- a/trunk/fs/minix/dir.c
+++ b/trunk/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= minix_sync_file,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/trunk/fs/minix/file.c b/trunk/fs/minix/file.c
index 3eec3e607a87..17765f697e50 100644
--- a/trunk/fs/minix/file.c
+++ b/trunk/fs/minix/file.c
@@ -6,12 +6,15 @@
  *  minix regular file handling primitives
  */
 
+#include	/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
 * We have mostly NULLs here: the current defaults are OK for
 * the minix filesystem.
 */
+int minix_sync_file(struct file *, struct dentry *, int);
+
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -19,7 +22,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= minix_sync_file,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -27,3 +30,18 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
+
+int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return err;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return err;
+
+	err |= minix_sync_inode(inode);
+	return err ? -EIO : 0;
+}
diff --git a/trunk/fs/minix/inode.c b/trunk/fs/minix/inode.c
index f91a23693597..daad3c2740db 100644
--- a/trunk/fs/minix/inode.c
+++ b/trunk/fs/minix/inode.c
@@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb)
 	int i;
 	struct minix_sb_info *sbi = minix_sb(sb);
 
-	lock_kernel();
-
 	if (!(sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
 			sbi->s_ms->s_state = sbi->s_mount_state;
@@ -51,7 +49,7 @@ static void minix_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
-	unlock_kernel();
+	return;
 }
 
 static struct kmem_cache * minix_inode_cachep;
@@ -556,25 +554,38 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static int minix_write_inode(struct inode *inode, int wait)
+static struct buffer_head *minix_update_inode(struct inode *inode)
+{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		return V1_minix_update_inode(inode);
+	else
+		return V2_minix_update_inode(inode);
+}
+
+static int minix_write_inode(struct inode * inode, int wait)
+{
+	brelse(minix_update_inode(inode));
+	return 0;
+}
+
+int minix_sync_inode(struct inode * inode)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	if (INODE_VERSION(inode) == MINIX_V1)
-		bh = V1_minix_update_inode(inode);
-	else
-		bh = V2_minix_update_inode(inode);
-	if (!bh)
-		return -EIO;
-	if (wait && buffer_dirty(bh)) {
+	bh = minix_update_inode(inode);
+	if (bh && buffer_dirty(bh))
+	{
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+		{
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -EIO;
+			err = -1;
 		}
 	}
+	else if (!bh)
+		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/trunk/fs/minix/minix.h b/trunk/fs/minix/minix.h
index cb7fdd11f9a5..e6a0b193bea4 100644
--- a/trunk/fs/minix/minix.h
+++ b/trunk/fs/minix/minix.h
@@ -57,6 +57,7 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
+extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -71,6 +72,7 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
+extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
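Editorial aside, not part of the patch: the restored minix_sync_file() above encodes the standard fsync/fdatasync split, flush the data buffers unconditionally, but write the inode itself only when it is dirty, and for fdatasync only when I_DIRTY_DATASYNC-style state (data-relevant metadata) is pending. A tiny sketch of just that decision, with hypothetical flag names mirroring the kernel's I_DIRTY bits:

/* Hypothetical flags; only the short-circuit logic is the point. */
enum { DIRTY_SYNC = 1, DIRTY_DATASYNC = 2, DIRTY_PAGES = 4 };

static int need_inode_write(int state, int datasync)
{
	if (!(state & (DIRTY_SYNC | DIRTY_DATASYNC | DIRTY_PAGES)))
		return 0;	/* nothing dirty at all */
	if (datasync && !(state & DIRTY_DATASYNC))
		return 0;	/* fdatasync(): timestamp-only dirt can wait */
	return 1;		/* fsync(), or data-relevant metadata pending */
}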
diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c
index 527119afb6a5..c82805d088e1 100644
--- a/trunk/fs/namei.c
+++ b/trunk/fs/namei.c
@@ -552,17 +552,6 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
 	return result;
 }
 
-static __always_inline void set_root(struct nameidata *nd)
-{
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
-}
-
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
 	int res = 0;
@@ -571,10 +560,14 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
-		set_root(nd);
+		struct fs_struct *fs = current->fs;
+
 		path_put(&nd->path);
-		nd->path = nd->root;
-		path_get(&nd->root);
+
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	}
 
 	res = link_path_walk(link, nd);
@@ -675,23 +668,23 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	return err;
 }
 
-int follow_up(struct path *path)
+int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
 	spin_lock(&vfsmount_lock);
-	parent = path->mnt->mnt_parent;
-	if (parent == path->mnt) {
+	parent=(*mnt)->mnt_parent;
+	if (parent == *mnt) {
 		spin_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
-	mountpoint = dget(path->mnt->mnt_mountpoint);
+	mountpoint=dget((*mnt)->mnt_mountpoint);
 	spin_unlock(&vfsmount_lock);
-	dput(path->dentry);
-	path->dentry = mountpoint;
-	mntput(path->mnt);
-	path->mnt = parent;
+	dput(*dentry);
+	*dentry = mountpoint;
+	mntput(*mnt);
+	*mnt = parent;
 	return 1;
 }
 
@@ -702,7 +695,7 @@ static int __follow_mount(struct path *path)
 {
 	int res = 0;
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
+		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -715,32 +708,32 @@ static int __follow_mount(struct path *path)
 	return res;
 }
 
-static void follow_mount(struct path *path)
+static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
+	while (d_mountpoint(*dentry)) {
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
 		if (!mounted)
 			break;
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
+		dput(*dentry);
+		mntput(*mnt);
+		*mnt = mounted;
+		*dentry = dget(mounted->mnt_root);
 	}
 }
 
 /* no need for dcache_lock, as serialization is taken care in
 * namespace.c
 */
-int follow_down(struct path *path)
+int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(path);
+	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
+		dput(*dentry);
+		mntput(*mnt);
+		*mnt = mounted;
+		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
 	return 0;
@@ -748,16 +741,19 @@ int follow_down(struct path *path)
 
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
-	set_root(nd);
+	struct fs_struct *fs = current->fs;
 
 	while(1) {
 		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
-		if (nd->path.dentry == nd->root.dentry &&
-		    nd->path.mnt == nd->root.mnt) {
+		read_lock(&fs->lock);
+		if (nd->path.dentry == fs->root.dentry &&
+		    nd->path.mnt == fs->root.mnt) {
+			read_unlock(&fs->lock);
 			break;
 		}
+		read_unlock(&fs->lock);
 		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -779,7 +775,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 			mntput(nd->path.mnt);
 			nd->path.mnt = parent;
 		}
-		follow_mount(&nd->path);
+		follow_mount(&nd->path.mnt, &nd->path.dentry);
 	}
 }
 
@@ -1021,23 +1017,25 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return link_path_walk(name, nd);
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
+static int do_path_lookup(int dfd, const char *name,
+			  unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
+	struct fs_struct *fs = current->fs;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 	nd->depth = 0;
-	nd->root.mnt = NULL;
 
 	if (*name=='/') {
-		set_root(nd);
-		nd->path = nd->root;
-		path_get(&nd->root);
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
 		read_lock(&fs->lock);
 		nd->path = fs->pwd;
 		path_get(&fs->pwd);
@@ -1065,29 +1063,17 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		fput_light(file, fput_needed);
 	}
 
-	return 0;
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
-}
-
-/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
-			  unsigned int flags, struct nameidata *nd)
-{
-	int retval = path_init(dfd, name, flags, nd);
-	if (!retval)
-		retval = path_walk(name, nd);
+	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
-	if (nd->root.mnt) {
-		path_put(&nd->root);
-		nd->root.mnt = NULL;
-	}
+out_fail:
 	return retval;
+
+fput_fail:
+	fput_light(file, fput_needed);
+	goto out_fail;
 }
 
 int path_lookup(const char *name, unsigned int flags,
@@ -1127,18 +1113,14 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
 	path_get(&nd->path);
-	nd->root = nd->path;
-	path_get(&nd->root);
 
 	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	path_put(&nd->root);
-	nd->root.mnt = NULL;
-
 	return retval;
+
 }
 
 /**
@@ -1694,14 +1676,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
-	error = path_walk(pathname, &nd);
-	if (error)
-		return ERR_PTR(error);
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
 	/*
 	 * We have the parent and last component. First of all, check
@@ -1829,8 +1806,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
diff --git a/trunk/fs/namespace.c b/trunk/fs/namespace.c
index 2dd333b0fe7f..134d494158d9 100644
--- a/trunk/fs/namespace.c
+++ b/trunk/fs/namespace.c
@@ -131,20 +131,10 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-#ifdef CONFIG_SMP
-		mnt->mnt_writers = alloc_percpu(int);
-		if (!mnt->mnt_writers)
-			goto out_free_devname;
-#else
-		mnt->mnt_writers = 0;
-#endif
+		atomic_set(&mnt->__mnt_writers, 0);
 	}
 	return mnt;
 
-#ifdef CONFIG_SMP
-out_free_devname:
-	kfree(mnt->mnt_devname);
-#endif
out_free_id:
 	mnt_free_id(mnt);
out_free_cache:
@@ -181,38 +171,65 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-static inline void inc_mnt_writers(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-#else
-	mnt->mnt_writers++;
-#endif
-}
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
 
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static int __init init_mnt_writers(void)
 {
-#ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-#else
-	mnt->mnt_writers--;
-#endif
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
 }
+fs_initcall(init_mnt_writers);
 
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static void unlock_mnt_writers(void)
 {
-#ifdef CONFIG_SMP
-	unsigned int count = 0;
 	int cpu;
+	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
 	}
+}
 
-	return count;
-#else
-	return mnt->mnt_writers;
-#endif
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					    struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
 }
 
 /*
@@ -236,73 +253,74 @@ static unsigned int count_mnt_writers(struct vfsmount *mnt)
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
+	struct mnt_writer *cpu_writer;
 
-	preempt_disable();
-	inc_mnt_writers(mnt);
-	/*
-	 * The store to inc_mnt_writers must be visible before we pass
-	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
-	 * incremented count after it has set MNT_WRITE_HOLD.
-	 */
-	smp_mb();
-	while (mnt->mnt_flags & MNT_WRITE_HOLD)
-		cpu_relax();
-	/*
-	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
-	 * be set to match its requirements. So we must not load that until
-	 * MNT_WRITE_HOLD is cleared.
-	 */
-	smp_rmb();
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
 	if (__mnt_is_readonly(mnt)) {
-		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
out:
-	preempt_enable();
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
-	/* superblock may be r/o */
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	preempt_disable();
-	inc_mnt_writers(mnt);
-	preempt_enable();
-	return 0;
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
 }
-EXPORT_SYMBOL_GPL(mnt_clone_write);
 
-/**
- * mnt_want_write_file - get write access to a file's mount
- * @file: the file who's mount on which to take a write
- *
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
 */
-int mnt_want_write_file(struct file *file)
+static void handle_write_count_underflow(struct vfsmount *mnt)
 {
-	if (!(file->f_mode & FMODE_WRITE))
-		return mnt_want_write(file->f_path.mnt);
-	else
-		return mnt_clone_write(file->f_path.mnt);
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
 }
-EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
 * mnt_drop_write - give up write access to a mount
@@ -314,9 +332,37 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
 */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	preempt_disable();
-	dec_mnt_writers(mnt);
-	preempt_enable();
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could call this each time,
+	 * but the __mnt_writers cacheline tends to
+	 * be cold, and makes this expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -324,41 +370,24 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
-	mnt->mnt_flags |= MNT_WRITE_HOLD;
+	lock_mnt_writers();
 	/*
-	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
-	 * should be visible before we do.
+	 * With all the locks held, this value is stable
 	 */
-	smp_mb();
-
-	/*
-	 * With writers on hold, if this value is zero, then there are
-	 * definitely no active writers (although held writers may subsequently
-	 * increment the count, they'll have to wait, and decrement it after
-	 * seeing MNT_READONLY).
-	 *
-	 * It is OK to have counter incremented on one CPU and decremented on
-	 * another: the sum will add up correctly. The danger would be when we
-	 * sum up each counter, if we read a counter before it is incremented,
-	 * but then read another CPU's count which it has been subsequently
-	 * decremented from -- we would see more decrements than we should.
-	 * MNT_WRITE_HOLD protects against this scenario, because
-	 * mnt_want_write first increments count, then smp_mb, then spins on
-	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
-	 * we're counting up here.
-	 */
-	if (count_mnt_writers(mnt) > 0)
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
 		ret = -EBUSY;
-	else
-		mnt->mnt_flags |= MNT_READONLY;
+		goto out;
+	}
 	/*
-	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
-	 * that become unheld will see MNT_READONLY.
+	 * nobody can do a successful mnt_want_write() with all
+	 * of the counts in MNT_DENIED_WRITE and the locks held.
 	 */
-	smp_wmb();
-	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	spin_lock(&vfsmount_lock);
+	if (!ret)
+		mnt->mnt_flags |= MNT_READONLY;
 	spin_unlock(&vfsmount_lock);
+out:
+	unlock_mnt_writers();
 	return ret;
 }
 
@@ -381,9 +410,6 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
-#ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_writers);
-#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -416,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
-struct vfsmount *lookup_mnt(struct path *path)
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *child_mnt;
 
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
+	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
@@ -578,18 +604,38 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		if (cpu_writer->mnt != mnt) {
+			spin_unlock(&cpu_writer->lock);
+			continue;
+		}
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
 	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	/*
-	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
-	 * provides barriers, so count_mnt_writers() below is safe.  AV
-	 */
-	WARN_ON(count_mnt_writers(mnt));
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -1060,8 +1106,11 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY))
+		if (!(sb->s_flags & MS_RDONLY)) {
+			lock_kernel();
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
+			unlock_kernel();
+		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1204,11 +1253,11 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct path *path)
+struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
@@ -1381,7 +1430,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (!d_unlinked(path->dentry))
+	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1552,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1563,7 +1612,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (d_unlinked(path->dentry))
+	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -1627,9 +1676,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1648,10 +1695,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+	if (!check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
@@ -2045,8 +2092,10 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
+	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
+	unlock_kernel();
 	free_page(data_page);
 
out3:
@@ -2126,9 +2175,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unlinked(new.dentry))
+	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
 		goto out2;
-	if (d_unlinked(old.dentry))
+	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
diff --git a/trunk/fs/ncpfs/inode.c b/trunk/fs/ncpfs/inode.c
index b99ce205b1bd..d642f0e5b365 100644
--- a/trunk/fs/ncpfs/inode.c
+++ b/trunk/fs/ncpfs/inode.c
@@ -736,8 +736,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -771,8 +769,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
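Editorial aside, not part of the patch: the namespace.c hunks above restore a split write count, a cheap per-cpu counter on the hot path, coalesced into one central atomic under per-cpu locks whenever an exact total is needed (mnt_make_readonly) or an underflow must be repaired. A simplified userspace model of that shape, with hypothetical names and a fixed CPU count, follows.

#include <pthread.h>
#include <stdatomic.h>

#define NCPU 4

struct local_count {
	pthread_mutex_t lock;
	long count;
};

static struct local_count percpu[NCPU] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
};
static atomic_long total;

/* Fast path: a writer touches only its own slot (cf. mnt_want_write()). */
static void get_write(int cpu)
{
	pthread_mutex_lock(&percpu[cpu].lock);
	percpu[cpu].count++;
	pthread_mutex_unlock(&percpu[cpu].lock);
}

/* Slow path: fold every slot into the shared atomic while holding all
 * the locks, so the caller reads an exact, stable total (cf.
 * lock_mnt_writers() plus __clear_mnt_count() above). */
static long fold_and_read(void)
{
	long sum;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++) {
		pthread_mutex_lock(&percpu[cpu].lock);
		atomic_fetch_add(&total, percpu[cpu].count);
		percpu[cpu].count = 0;
	}
	sum = atomic_load(&total);	/* stable: all slots are locked */
	for (cpu = 0; cpu < NCPU; cpu++)
		pthread_mutex_unlock(&percpu[cpu].lock);
	return sum;
}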
f01caec84463..64a288ee046d 100644 --- a/trunk/fs/nfs/namespace.c +++ b/trunk/fs/nfs/namespace.c @@ -154,7 +154,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) goto out; out_follow: while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) + follow_down(&nd->path.mnt, &nd->path.dentry)) ; err = 0; goto out; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 26127b69a275..d2d67781c579 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -1813,7 +1813,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; - lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1838,7 +1837,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); - unlock_kernel(); return error; } diff --git a/trunk/fs/nfsd/export.c b/trunk/fs/nfsd/export.c index 8b1f8efb4690..5839b229cd0e 100644 --- a/trunk/fs/nfsd/export.c +++ b/trunk/fs/nfsd/export.c @@ -847,8 +847,9 @@ exp_get_fsid_key(svc_client *clp, int fsid) return exp_find_key(clp, FSID_NUM, fsidv, NULL); } -static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, - struct cache_req *reqp) +static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, + struct dentry *dentry, + struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -857,7 +858,8 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, return ERR_PTR(-ENOENT); key.ex_client = clp; - key.ex_path = *path; + key.ex_path.mnt = mnt; + key.ex_path.dentry = dentry; exp = svc_export_lookup(&key); if (exp == NULL) @@ -871,19 +873,24 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, /* * Find the export entry for a given dentry. 
*/ -static struct svc_export *exp_parent(svc_client *clp, struct path *path) +static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, + struct dentry *dentry, + struct cache_req *reqp) { - struct dentry *saved = dget(path->dentry); - svc_export *exp = exp_get_by_name(clp, path, NULL); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { - struct dentry *parent = dget_parent(path->dentry); - dput(path->dentry); - path->dentry = parent; - exp = exp_get_by_name(clp, path, NULL); + svc_export *exp; + + dget(dentry); + exp = exp_get_by_name(clp, mnt, dentry, reqp); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { + struct dentry *parent; + + parent = dget_parent(dentry); + dput(dentry); + dentry = parent; + exp = exp_get_by_name(clp, mnt, dentry, reqp); } - dput(path->dentry); - path->dentry = saved; + dput(dentry); return exp; } @@ -1011,7 +1018,7 @@ exp_export(struct nfsctl_export *nxp) goto out_put_clp; err = -EINVAL; - exp = exp_get_by_name(clp, &path, NULL); + exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); memset(&new, 0, sizeof(new)); @@ -1128,7 +1135,7 @@ exp_unexport(struct nfsctl_export *nxp) goto out_domain; err = -EINVAL; - exp = exp_get_by_name(dom, &path, NULL); + exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); path_put(&path); if (IS_ERR(exp)) goto out_domain; @@ -1170,7 +1177,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); - exp = exp_parent(clp, &path); + exp = exp_parent(clp, path.mnt, path.dentry, NULL); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; @@ -1200,7 +1207,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, if (IS_ERR(ek)) return ERR_CAST(ek); - exp = exp_get_by_name(clp, &ek->ek_path, reqp); + exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); cache_put(&ek->h, &svc_expkey_cache); if (IS_ERR(exp)) @@ -1240,7 +1247,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); @@ -1248,7 +1256,8 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) goto gss; /* First try the auth_unix client: */ - exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); + exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, + &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) @@ -1260,7 +1269,8 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; - gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); + gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, + &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) @@ -1299,19 +1309,23 @@ rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) } struct svc_export * -rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) { - struct dentry *saved = dget(path->dentry); - struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { - struct dentry *parent = dget_parent(path->dentry); - dput(path->dentry); - path->dentry = parent; - exp = rqst_exp_get_by_name(rqstp, path); + struct svc_export *exp; + + dget(dentry); + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { + struct dentry *parent; + + parent = dget_parent(dentry); + dput(dentry); + dentry = parent; + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); } - dput(path->dentry); - path->dentry = saved; + dput(dentry); return exp; } diff --git a/trunk/fs/nfsd/vfs.c b/trunk/fs/nfsd/vfs.c index 99f835753596..bd584bcf1d9f 100644 --- a/trunk/fs/nfsd/vfs.c +++ b/trunk/fs/nfsd/vfs.c @@ -101,35 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, { struct svc_export *exp = *expp, *exp2 = NULL; struct dentry *dentry = *dpp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dentry)}; + struct vfsmount *mnt = mntget(exp->ex_path.mnt); + struct dentry *mounts = dget(dentry); int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); - exp2 = rqst_exp_get_by_name(rqstp, &path); + exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); if (IS_ERR(exp2)) { if (PTR_ERR(exp2) != -ENOENT) err = PTR_ERR(exp2); - path_put(&path); + dput(mounts); + mntput(mnt); goto out; } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* - * This is subtle: path.dentry is *not* on path.mnt - * at this point. The only reason we are safe is that - * original mnt is pinned down by exp, so we should - * put path *before* putting exp + * This is subtle: dentry is *not* under mnt at this point. + * The only reason we are safe is that original mnt is pinned + * down by exp, so we should dput before putting exp. 
*/ - *dpp = path.dentry; - path.dentry = dentry; + dput(dentry); + *dpp = mounts; + exp_put(exp); *expp = exp2; - exp2 = exp; + } else { + exp_put(exp2); + dput(mounts); } - path_put(&path); - exp_put(exp2); + mntput(mnt); out: return err; } @@ -168,29 +169,28 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* checking mountpoint crossing is very different when stepping up */ struct svc_export *exp2 = NULL; struct dentry *dp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dparent)}; - - while (path.dentry == path.mnt->mnt_root && - follow_up(&path)) + struct vfsmount *mnt = mntget(exp->ex_path.mnt); + dentry = dget(dparent); + while (dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) ; - dp = dget_parent(path.dentry); - dput(path.dentry); - path.dentry = dp; + dp = dget_parent(dentry); + dput(dentry); + dentry = dp; - exp2 = rqst_exp_parent(rqstp, &path); + exp2 = rqst_exp_parent(rqstp, mnt, dentry); if (PTR_ERR(exp2) == -ENOENT) { + dput(dentry); dentry = dget(dparent); } else if (IS_ERR(exp2)) { host_err = PTR_ERR(exp2); - path_put(&path); + dput(dentry); + mntput(mnt); goto out_nfserr; } else { - dentry = dget(path.dentry); exp_put(exp); exp = exp2; } - path_put(&path); + mntput(mnt); } } else { fh_lock(fhp); diff --git a/trunk/fs/nilfs2/cpfile.c b/trunk/fs/nilfs2/cpfile.c index cadd36b14d07..300f1cdfa862 100644 --- a/trunk/fs/nilfs2/cpfile.c +++ b/trunk/fs/nilfs2/cpfile.c @@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) case NILFS_CHECKPOINT: /* * Check for protecting existing snapshot mounts: - * ns_mount_mutex is used to make this operation atomic and + * bd_mount_sem is used to make this operation atomic and * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose.
*/ - mutex_lock(&nilfs->ns_mount_mutex); + down(&nilfs->ns_bdev->bd_mount_sem); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - mutex_unlock(&nilfs->ns_mount_mutex); + up(&nilfs->ns_bdev->bd_mount_sem); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/trunk/fs/nilfs2/sb.h b/trunk/fs/nilfs2/sb.h index 0776ccc2504a..adccd4fc654e 100644 --- a/trunk/fs/nilfs2/sb.h +++ b/trunk/fs/nilfs2/sb.h @@ -60,7 +60,6 @@ struct nilfs_sb_info { struct super_block *s_super; /* reverse pointer to super_block */ struct the_nilfs *s_nilfs; struct list_head s_list; /* list head for nilfs->ns_supers */ - atomic_t s_count; /* reference count */ /* Segment constructor */ struct list_head s_dirty_files; /* dirty files list */ diff --git a/trunk/fs/nilfs2/super.c b/trunk/fs/nilfs2/super.c index 1777a3467bd2..6989b03e97ab 100644 --- a/trunk/fs/nilfs2/super.c +++ b/trunk/fs/nilfs2/super.c @@ -65,8 +65,9 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); /** * nilfs_error() - report failure condition on a filesystem @@ -314,11 +315,6 @@ static void nilfs_put_super(struct super_block *sb) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - lock_kernel(); - - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -327,18 +323,12 @@ static void nilfs_put_super(struct super_block *sb) nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } - down_write(&nilfs->ns_super_sem); - if (nilfs->ns_current == sbi) - nilfs->ns_current = NULL; - up_write(&nilfs->ns_super_sem); nilfs_detach_checkpoint(sbi); put_nilfs(sbi->s_nilfs); sbi->s_super = NULL; sb->s_fs_info = NULL; - nilfs_put_sbinfo(sbi); - - unlock_kernel(); + kfree(sbi); } /** @@ -393,8 +383,6 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) { int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); @@ -408,9 +396,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) struct buffer_head *bh_cp; int err; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_add(&sbi->s_list, &nilfs->ns_supers); - up_write(&nilfs->ns_super_sem); + up_write(&nilfs->ns_sem); sbi->s_ifile = nilfs_mdt_new( nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); @@ -448,9 +436,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_super_sem); + up_write(&nilfs->ns_sem); return err; } @@ -462,9 +450,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) nilfs_mdt_clear(sbi->s_ifile); nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_super_sem); + down_write(&nilfs->ns_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_super_sem); + 
up_write(&nilfs->ns_sem); } static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) @@ -764,7 +752,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, * @silent: silent mode flag * @nilfs: the_nilfs struct * - * This function is called exclusively by nilfs->ns_mount_mutex. + * This function is called exclusively under bd_mount_sem. * So, the recovery process is protected from other simultaneous mounts. */ static int @@ -785,7 +773,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, get_nilfs(nilfs); sbi->s_nilfs = nilfs; sbi->s_super = sb; - atomic_set(&sbi->s_count, 1); err = init_nilfs(nilfs, sbi, (char *)data); if (err) @@ -883,11 +870,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_root; } - down_write(&nilfs->ns_super_sem); - if (!nilfs_test_opt(sbi, SNAPSHOT)) - nilfs->ns_current = sbi; - up_write(&nilfs->ns_super_sem); - return 0; failed_root: @@ -903,7 +885,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, failed_sbi: put_nilfs(nilfs); sb->s_fs_info = NULL; - nilfs_put_sbinfo(sbi); + kfree(sbi); return err; } @@ -916,9 +898,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_mount_options old_opts; int err; - lock_kernel(); - - down_write(&nilfs->ns_super_sem); old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; @@ -966,12 +945,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ - if (nilfs->ns_current && nilfs->ns_current != sbi) { + down(&sb->s_bdev->bd_mount_sem); + /* Check for an existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because an RW-mount exists.\n", + "remount because an RW-mount exists.\n", sb->s_id); err = -EBUSY; - goto restore_opts; + goto rw_remount_failed; } if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " @@ -979,7 +960,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) "the latest one.\n", sb->s_id); err = -EINVAL; - goto restore_opts; + goto rw_remount_failed; } sb->s_flags &= ~MS_RDONLY; nilfs_clear_opt(sbi, SNAPSHOT); @@ -987,31 +968,28 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = nilfs_attach_segment_constructor(sbi); if (err) - goto restore_opts; + goto rw_remount_failed; down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); up_write(&nilfs->ns_sem); - nilfs->ns_current = sbi; + up(&sb->s_bdev->bd_mount_sem); } out: - up_write(&nilfs->ns_super_sem); - unlock_kernel(); return 0; + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.mount_opt; sbi->s_snapshot_cno = old_opts.snapshot_cno; - up_write(&nilfs->ns_super_sem); - unlock_kernel(); return err; } struct nilfs_super_data { struct block_device *bdev; - struct nilfs_sb_info *sbi; __u64 cno; int flags; }; @@ -1070,7 +1048,33 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data) { struct nilfs_super_data *sd = data; - return sd->sbi && s->s_fs_info == (void *)sd->sbi; + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags |
sd->flags) & MS_RDONLY) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() causes a deadlock. + */ + return ret; + } + } + return 0; } static int @@ -1078,8 +1082,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { struct nilfs_super_data sd; - struct super_block *s; - struct the_nilfs *nilfs; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; int err, need_to_close = 1; sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); @@ -1091,6 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, * much more information than normal filesystems to identify mount * instance. For snapshot mounts, not only a mount type (ro-mount * or rw-mount) but also a checkpoint number is required. + * The results are passed to sget() using nilfs_super_data. */ sd.cno = 0; sd.flags = flags; @@ -1099,59 +1104,64 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, goto failed; } - nilfs = find_or_create_nilfs(sd.bdev); - if (!nilfs) { - err = -ENOMEM; - goto failed; - } - - mutex_lock(&nilfs->ns_mount_mutex); - - if (!sd.cno) { - /* - * Check if an exclusive mount exists or not. - * Snapshot mounts coexist with a current mount - * (i.e. rw-mount or ro-mount), whereas rw-mount and - * ro-mount are mutually exclusive. - */ - down_read(&nilfs->ns_super_sem); - if (nilfs->ns_current && - ((nilfs->ns_current->s_super->s_flags ^ flags) - & MS_RDONLY)) { - up_read(&nilfs->ns_super_sem); - err = -EBUSY; - goto failed_unlock; - } - up_read(&nilfs->ns_super_sem); - } - /* - * Find existing nilfs_sb_info struct + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting */ - sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? : -EBUSY; + goto failed_unlock; + } /* - * Get super block instance holding the nilfs_sb_info struct. - * A new instance is allocated if no existing mount is present or - * existing instance has been unmounted. + * Phase-1: search for any existent instance and get the_nilfs */ s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); - if (sd.sbi) - nilfs_put_sbinfo(sd.sbi); + if (IS_ERR(s)) + goto error_s; - if (IS_ERR(s)) { - err = PTR_ERR(s); - goto failed_unlock; + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects the super_block from the unmount process; + * it covers the pointers to nilfs_sb_info and the_nilfs. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search for the specified snapshot or R/W-mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive() only via + * kill_block_super();
here, s is an existent mount, so we need + * one more close_bdev_exclusive() call. + */ + s = s2; + if (IS_ERR(s)) + goto error_s; } if (!s->s_root) { char b[BDEVNAME_SIZE]; - /* New superblock instance created */ s->s_flags = flags; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); @@ -1162,18 +1172,26 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; } - mutex_unlock(&nilfs->ns_mount_mutex); + up(&sd.bdev->bd_mount_sem); put_nilfs(nilfs); if (need_to_close) close_bdev_exclusive(sd.bdev, flags); simple_set_mnt(mnt, s); return 0; + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + failed_unlock: - mutex_unlock(&nilfs->ns_mount_mutex); - put_nilfs(nilfs); + up(&sd.bdev->bd_mount_sem); failed: close_bdev_exclusive(sd.bdev, flags); @@ -1181,18 +1199,70 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, cancel_new: /* Abandoning the newly allocated superblock */ - mutex_unlock(&nilfs->ns_mount_mutex); - put_nilfs(nilfs); + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); up_write(&s->s_umount); deactivate_super(s); /* * deactivate_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; - * put_nilfs() needs the block device. + * put_nilfs() and unlocking bd_mount_sem need the block device. */ return err; } +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not a good idea */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. + * @fs_type: filesystem type + * @bdev: block device + * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * + * This function must be called within a section protected by bd_mount_sem.
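+ *
+ * An illustrative call pattern follows (an editor's sketch, not code from
+ * this patch; compare the nilfs_get_sb() and nilfs_remount() callers above):
+ *
+ *	down(&bdev->bd_mount_sem);
+ *	err = test_exclusive_mount(fs_type, bdev, flags ^ MS_RDONLY);
+ *	if (err > 0)
+ *		err = -EBUSY;	/* a conflicting mount exists */
+ *	else if (err < 0)
+ *		goto failed;	/* a sget() failure is propagated */
+ *	...
+ *	up(&bdev->bd_mount_sem);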
+ */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", diff --git a/trunk/fs/nilfs2/the_nilfs.c b/trunk/fs/nilfs2/the_nilfs.c index e4e5c78bcc93..a91f15b8673c 100644 --- a/trunk/fs/nilfs2/the_nilfs.c +++ b/trunk/fs/nilfs2/the_nilfs.c @@ -35,10 +35,6 @@ #include "seglist.h" #include "segbuf.h" - -static LIST_HEAD(nilfs_objects); -static DEFINE_SPINLOCK(nilfs_lock); - void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { @@ -59,7 +55,7 @@ void nilfs_set_last_segment * Return Value: On success, pointer to the_nilfs is returned. * On error, NULL is returned. */ -static struct the_nilfs *alloc_nilfs(struct block_device *bdev) +struct the_nilfs *alloc_nilfs(struct block_device *bdev) { struct the_nilfs *nilfs; @@ -72,10 +68,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); - init_rwsem(&nilfs->ns_super_sem); - mutex_init(&nilfs->ns_mount_mutex); mutex_init(&nilfs->ns_writer_mutex); - INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_gc_inodes_h = NULL; @@ -84,45 +77,6 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) return nilfs; } -/** - * find_or_create_nilfs - find or create nilfs object - * @bdev: block device to which the_nilfs is related - * - * find_nilfs() looks up an existent nilfs object created on the - * device and gets the reference count of the object. If no nilfs object - * is found on the device, a new nilfs object is allocated. - * - * Return Value: On success, pointer to the nilfs object is returned. - * On error, NULL is returned. - */ -struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) -{ - struct the_nilfs *nilfs, *new = NULL; - - retry: - spin_lock(&nilfs_lock); - list_for_each_entry(nilfs, &nilfs_objects, ns_list) { - if (nilfs->ns_bdev == bdev) { - get_nilfs(nilfs); - spin_unlock(&nilfs_lock); - if (new) - put_nilfs(new); - return nilfs; /* existing object */ - } - } - if (new) { - list_add_tail(&new->ns_list, &nilfs_objects); - spin_unlock(&nilfs_lock); - return new; /* new object */ - } - spin_unlock(&nilfs_lock); - - new = alloc_nilfs(bdev); - if (new) - goto retry; - return NULL; /* insufficient memory */ -} - /** * put_nilfs - release a reference to the_nilfs * @nilfs: the_nilfs structure to be released @@ -132,20 +86,13 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) */ void put_nilfs(struct the_nilfs *nilfs) { - spin_lock(&nilfs_lock); - if (!atomic_dec_and_test(&nilfs->ns_count)) { - spin_unlock(&nilfs_lock); + if (!atomic_dec_and_test(&nilfs->ns_count)) return; - } - list_del_init(&nilfs->ns_list); - spin_unlock(&nilfs_lock); - /* - * Increment of ns_count never occurs below because the caller + * Increment of ns_count never occurs below because the caller * of get_nilfs() holds at least one reference to the_nilfs. * Thus its exclusion control is not required here.
*/ - might_sleep(); if (nilfs_loaded(nilfs)) { nilfs_mdt_clear(nilfs->ns_sufile); @@ -666,63 +613,13 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs) return ret; } -/** - * nilfs_find_sbinfo - find existing nilfs_sb_info structure - * @nilfs: nilfs object - * @rw_mount: mount type (non-zero value for read/write mount) - * @cno: checkpoint number (zero for read-only mount) - * - * nilfs_find_sbinfo() returns the nilfs_sb_info structure which - * @rw_mount and @cno (in case of snapshots) matched. If no instance - * was found, NULL is returned. Although the super block instance can - * be unmounted after this function returns, the nilfs_sb_info struct - * is kept on memory until nilfs_put_sbinfo() is called. - */ -struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, - int rw_mount, __u64 cno) -{ - struct nilfs_sb_info *sbi; - - down_read(&nilfs->ns_super_sem); - /* - * The SNAPSHOT flag and sb->s_flags are supposed to be - * protected with nilfs->ns_super_sem. - */ - sbi = nilfs->ns_current; - if (rw_mount) { - if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) - goto found; /* read/write mount */ - else - goto out; - } else if (cno == 0) { - if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) - goto found; /* read-only mount */ - else - goto out; - } - - list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { - if (nilfs_test_opt(sbi, SNAPSHOT) && - sbi->s_snapshot_cno == cno) - goto found; /* snapshot mount */ - } - out: - up_read(&nilfs->ns_super_sem); - return NULL; - - found: - atomic_inc(&sbi->s_count); - up_read(&nilfs->ns_super_sem); - return sbi; -} - int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, int snapshot_mount) { struct nilfs_sb_info *sbi; int ret = 0; - down_read(&nilfs->ns_super_sem); + down_read(&nilfs->ns_sem); if (cno == 0 || cno > nilfs->ns_cno) goto out_unlock; @@ -739,6 +636,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, ret++; out_unlock: - up_read(&nilfs->ns_super_sem); + up_read(&nilfs->ns_sem); return ret; } diff --git a/trunk/fs/nilfs2/the_nilfs.h b/trunk/fs/nilfs2/the_nilfs.h index e8adbffc626f..30fe58778d05 100644 --- a/trunk/fs/nilfs2/the_nilfs.h +++ b/trunk/fs/nilfs2/the_nilfs.h @@ -43,16 +43,12 @@ enum { * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_count: reference count - * @ns_list: list head for nilfs_list * @ns_bdev: block device * @ns_bdi: backing dev info * @ns_writer: back pointer to writable nilfs_sb_info * @ns_sem: semaphore for shared states - * @ns_super_sem: semaphore for global operations across super block instances - * @ns_mount_mutex: mutex protecting mount process of nilfs * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer - * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super blocks @@ -92,23 +88,14 @@ enum { struct the_nilfs { unsigned long ns_flags; atomic_t ns_count; - struct list_head ns_list; struct block_device *ns_bdev; struct backing_dev_info *ns_bdi; struct nilfs_sb_info *ns_writer; struct rw_semaphore ns_sem; - struct rw_semaphore ns_super_sem; - struct mutex ns_mount_mutex; struct mutex ns_writer_mutex; atomic_t ns_writer_refcount; - /* - * components protected by ns_super_sem - */ - struct nilfs_sb_info *ns_current; - struct list_head ns_supers; - /* * used for * - loading the latest checkpoint exclusively. 
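To make the restored reference counting concrete, the expected lifetime of a the_nilfs object after this change can be sketched as follows (an illustrative fragment, not part of the patch; it assumes alloc_nilfs() initializes ns_count to 1, which the atomic_dec_and_test() in put_nilfs() implies, and it omits error handling):

	struct the_nilfs *nilfs = alloc_nilfs(bdev);	/* ns_count == 1 */
	if (!nilfs)
		return -ENOMEM;
	get_nilfs(nilfs);	/* second user, e.g. a snapshot mount reusing it */
	...
	put_nilfs(nilfs);	/* ns_count 2 -> 1: the object survives */
	put_nilfs(nilfs);	/* ns_count 1 -> 0: the_nilfs is freed */
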
@@ -121,6 +108,7 @@ struct the_nilfs { time_t ns_sbwtime[2]; unsigned ns_sbsize; unsigned ns_mount_state; + struct list_head ns_supers; /* * Following fields are dedicated to a writable FS-instance. @@ -203,12 +191,11 @@ THE_NILFS_FNS(DISCONTINUED, discontinued) #define NILFS_ALTSB_FREQ 60 /* spare superblock */ void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); -struct the_nilfs *find_or_create_nilfs(struct block_device *); +struct the_nilfs *alloc_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); -struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); int nilfs_near_disk_full(struct the_nilfs *); void nilfs_fall_back_super_block(struct the_nilfs *); @@ -251,12 +238,6 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) mutex_unlock(&nilfs->ns_writer_mutex); } -static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) -{ - if (!atomic_dec_and_test(&sbi->s_count)) - kfree(sbi); -} - static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) diff --git a/trunk/fs/notify/Kconfig b/trunk/fs/notify/Kconfig index 31dac7e3b0f1..50914d7303c6 100644 --- a/trunk/fs/notify/Kconfig +++ b/trunk/fs/notify/Kconfig @@ -1,15 +1,2 @@ -config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. 
- source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/trunk/fs/notify/Makefile b/trunk/fs/notify/Makefile index 0922cc826c46..5a95b6010ce7 100644 --- a/trunk/fs/notify/Makefile +++ b/trunk/fs/notify/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o - obj-y += dnotify/ obj-y += inotify/ diff --git a/trunk/fs/notify/dnotify/Kconfig b/trunk/fs/notify/dnotify/Kconfig index 904ff8d5405a..26adf5dfa646 100644 --- a/trunk/fs/notify/dnotify/Kconfig +++ b/trunk/fs/notify/dnotify/Kconfig @@ -1,6 +1,5 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/trunk/fs/notify/dnotify/dnotify.c b/trunk/fs/notify/dnotify/dnotify.c index 828a889be909..b0aa2cde80bd 100644 --- a/trunk/fs/notify/dnotify/dnotify.c +++ b/trunk/fs/notify/dnotify/dnotify.c @@ -3,9 +3,6 @@ * * Copyright (C) 2000,2001,2002 Stephen Rothwell * - * Copyright (C) 2009 Eric Paris - * dnotify was largly rewritten to use the new fsnotify infrastructure - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -24,173 +21,24 @@ #include #include #include -#include int dir_notify_enable __read_mostly = 1; -static struct kmem_cache *dnotify_struct_cache __read_mostly; -static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; -static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); - -/* - * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which - * is being watched by dnotify. If multiple userspace applications are watching - * the same directory with dnotify their information is chained in dn - */ -struct dnotify_mark_entry { - struct fsnotify_mark_entry fsn_entry; - struct dnotify_struct *dn; -}; +static struct kmem_cache *dn_cache __read_mostly; -/* - * When a process starts or stops watching an inode the set of events which - * dnotify cares about for that inode may change. This function runs the - * list of everything receiving dnotify events about this directory and calculates - * the set of all those events. After it updates what dnotify is interested in - * it calls the fsnotify function so it can update the set of all events relevant - * to this inode. - */ -static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) +static void redo_inode_mask(struct inode *inode) { - __u32 new_mask, old_mask; + unsigned long new_mask; struct dnotify_struct *dn; - struct dnotify_mark_entry *dnentry = container_of(entry, - struct dnotify_mark_entry, - fsn_entry); - - assert_spin_locked(&entry->lock); - old_mask = entry->mask; new_mask = 0; - for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) - new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - entry->mask = new_mask; - - if (old_mask == new_mask) - return; - - if (entry->inode) - fsnotify_recalc_inode_mask(entry->inode); -} - -/* - * Mains fsnotify call where events are delivered to dnotify. - * Find the dnotify mark on the relevant inode, run the list of dnotify structs - * on that mark and determine which of them has expressed interest in receiving - * events of this type. When found send the correct process and signal and - * destroy the dnotify struct if it was not registered to receive multiple - * events. 
- */ -static int dnotify_handle_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - struct fsnotify_mark_entry *entry = NULL; - struct dnotify_mark_entry *dnentry; - struct inode *to_tell; - struct dnotify_struct *dn; - struct dnotify_struct **prev; - struct fown_struct *fown; - - to_tell = event->to_tell; - - spin_lock(&to_tell->i_lock); - entry = fsnotify_find_mark_entry(group, to_tell); - spin_unlock(&to_tell->i_lock); - - /* unlikely since we alreay passed dnotify_should_send_event() */ - if (unlikely(!entry)) - return 0; - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - - spin_lock(&entry->lock); - prev = &dnentry->dn; - while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { - prev = &dn->dn_next; - continue; - } - fown = &dn->dn_filp->f_owner; - send_sigio(fown, dn->dn_fd, POLL_MSG); - if (dn->dn_mask & FS_DN_MULTISHOT) - prev = &dn->dn_next; - else { - *prev = dn->dn_next; - kmem_cache_free(dnotify_struct_cache, dn); - dnotify_recalc_inode_mask(entry); - } - } - - spin_unlock(&entry->lock); - fsnotify_put_mark(entry); - - return 0; -} - -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, __u32 mask) -{ - struct fsnotify_mark_entry *entry; - bool send; - - /* !dir_notify_enable should never get here, don't waste time checking - if (!dir_notify_enable) - return 0; */ - - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - - /* no mark means no dnotify watch */ - if (!entry) - return false; - - mask = (mask & ~FS_EVENT_ON_CHILD); - send = (mask & entry->mask); - - fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ - - return send; -} - -static void dnotify_free_mark(struct fsnotify_mark_entry *entry) -{ - struct dnotify_mark_entry *dnentry = container_of(entry, - struct dnotify_mark_entry, - fsn_entry); - - BUG_ON(dnentry->dn); - - kmem_cache_free(dnotify_mark_entry_cache, dnentry); + for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) + new_mask |= dn->dn_mask & ~DN_MULTISHOT; + inode->i_dnotify_mask = new_mask; } -static struct fsnotify_ops dnotify_fsnotify_ops = { - .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, -}; - -/* - * Called every time a file is closed. Looks first for a dnotify mark on the - * inode. If one is found run all of the ->dn entries attached to that - * mark for one relevant to this process closing the file and remove that - * dnotify_struct. If that was the last dnotify_struct also remove the - * fsnotify_mark_entry. 
- */ void dnotify_flush(struct file *filp, fl_owner_t id) { - struct fsnotify_mark_entry *entry; - struct dnotify_mark_entry *dnentry; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; @@ -198,243 +46,145 @@ void dnotify_flush(struct file *filp, fl_owner_t id) inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return; - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(dnotify_group, inode); - spin_unlock(&inode->i_lock); - if (!entry) - return; - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - - mutex_lock(&dnotify_mark_mutex); - - spin_lock(&entry->lock); - prev = &dnentry->dn; + prev = &inode->i_dnotify; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; - kmem_cache_free(dnotify_struct_cache, dn); - dnotify_recalc_inode_mask(entry); + redo_inode_mask(inode); + kmem_cache_free(dn_cache, dn); break; } prev = &dn->dn_next; } - - spin_unlock(&entry->lock); - - /* nothing else could have found us thanks to the dnotify_mark_mutex */ - if (dnentry->dn == NULL) - fsnotify_destroy_mark_by_entry(entry); - - fsnotify_recalc_group_mask(dnotify_group); - - mutex_unlock(&dnotify_mark_mutex); - - fsnotify_put_mark(entry); -} - -/* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) -{ - __u32 new_mask = FS_EVENT_ON_CHILD; - - if (arg & DN_MULTISHOT) - new_mask |= FS_DN_MULTISHOT; - if (arg & DN_DELETE) - new_mask |= (FS_DELETE | FS_MOVED_FROM); - if (arg & DN_MODIFY) - new_mask |= FS_MODIFY; - if (arg & DN_ACCESS) - new_mask |= FS_ACCESS; - if (arg & DN_ATTRIB) - new_mask |= FS_ATTRIB; - if (arg & DN_RENAME) - new_mask |= FS_DN_RENAME; - if (arg & DN_CREATE) - new_mask |= (FS_CREATE | FS_MOVED_TO); - - return new_mask; -} - -/* - * If multiple processes watch the same inode with dnotify there is only one - * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct - * onto that mark. This function either attaches the new dnotify_struct onto - * that list, or it |= the mask onto an existing dnofiy_struct. - */ -static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, - fl_owner_t id, int fd, struct file *filp, __u32 mask) -{ - struct dnotify_struct *odn; - - odn = dnentry->dn; - while (odn != NULL) { - /* adding more events to existing dnofiy_struct? */ - if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { - odn->dn_fd = fd; - odn->dn_mask |= mask; - return -EEXIST; - } - odn = odn->dn_next; - } - - dn->dn_mask = mask; - dn->dn_fd = fd; - dn->dn_filp = filp; - dn->dn_owner = id; - dn->dn_next = dnentry->dn; - dnentry->dn = dn; - - return 0; + spin_unlock(&inode->i_lock); } -/* - * When a process calls fcntl to attach a dnotify watch to a directory it ends - * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be - * attached to the fsnotify_mark. 
- */ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) { - struct dnotify_mark_entry *new_dnentry, *dnentry; - struct fsnotify_mark_entry *new_entry, *entry; struct dnotify_struct *dn; + struct dnotify_struct *odn; + struct dnotify_struct **prev; struct inode *inode; fl_owner_t id = current->files; struct file *f; - int destroy = 0, error = 0; - __u32 mask; - - /* we use these to tell if we need to kfree */ - new_entry = NULL; - dn = NULL; - - if (!dir_notify_enable) { - error = -EINVAL; - goto out_err; - } + int error = 0; - /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); - error = 0; - goto out_err; + return 0; } - - /* dnotify only works on directories */ + if (!dir_notify_enable) + return -EINVAL; inode = filp->f_path.dentry->d_inode; - if (!S_ISDIR(inode->i_mode)) { - error = -ENOTDIR; - goto out_err; - } - - /* expect most fcntl to add new rather than augment old */ - dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); - if (!dn) { - error = -ENOMEM; - goto out_err; - } - - /* new fsnotify mark, we expect most fcntl calls to add a new mark */ - new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); - if (!new_dnentry) { - error = -ENOMEM; - goto out_err; - } - - /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ - mask = convert_arg(arg); - - /* set up the new_entry and new_dnentry */ - new_entry = &new_dnentry->fsn_entry; - fsnotify_init_mark(new_entry, dnotify_free_mark); - new_entry->mask = mask; - new_dnentry->dn = NULL; - - /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); - - /* add the new_entry or find an old one. */ + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); + if (dn == NULL) + return -ENOMEM; spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(dnotify_group, inode); - spin_unlock(&inode->i_lock); - if (entry) { - dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); - spin_lock(&entry->lock); - } else { - fsnotify_add_mark(new_entry, dnotify_group, inode); - spin_lock(&new_entry->lock); - entry = new_entry; - dnentry = new_dnentry; - /* we used new_entry, so don't free it */ - new_entry = NULL; + prev = &inode->i_dnotify; + while ((odn = *prev) != NULL) { + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= arg; + inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; + goto out_free; + } + prev = &odn->dn_next; } rcu_read_lock(); f = fcheck(fd); rcu_read_unlock(); - - /* if (f != filp) means that we lost a race and another task/thread - * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and entry->lock. Since closing the fd is the - * only time we clean up the mark entries we need to get our mark off - * the list. */ - if (f != filp) { - /* if we added ourselves, shoot ourselves, it's possible that - * the flush actually did shoot this entry. That's fine too - * since multiple calls to destroy_mark is perfectly safe, if - * we found a dnentry already attached to the inode, just sod - * off silently as the flush at close time dealt with it. 
- */ - if (dnentry == new_dnentry) - destroy = 1; - goto out; - } + /* we'd lost the race with close(), sod off silently */ + /* note that inode->i_lock prevents reordering problems + * between accesses to descriptor table and ->i_dnotify */ + if (f != filp) + goto out_free; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) { - /* if we added, we must shoot */ - if (dnentry == new_dnentry) - destroy = 1; - goto out; - } + if (error) + goto out_free; - error = attach_dn(dn, dnentry, id, fd, filp, mask); - /* !error means that we attached the dn to the dnentry, so don't free it */ - if (!error) - dn = NULL; - /* -EEXIST means that we didn't add this new dn and used an old one. - * that isn't an error (and the unused dn should be freed) */ - else if (error == -EEXIST) - error = 0; + dn->dn_mask = arg; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; + dn->dn_next = inode->i_dnotify; + inode->i_dnotify = dn; + spin_unlock(&inode->i_lock); + return 0; - dnotify_recalc_inode_mask(entry); -out: - spin_unlock(&entry->lock); +out_free: + spin_unlock(&inode->i_lock); + kmem_cache_free(dn_cache, dn); + return error; +} - if (destroy) - fsnotify_destroy_mark_by_entry(entry); +void __inode_dir_notify(struct inode *inode, unsigned long event) +{ + struct dnotify_struct * dn; + struct dnotify_struct **prev; + struct fown_struct * fown; + int changed = 0; - fsnotify_recalc_group_mask(dnotify_group); + spin_lock(&inode->i_lock); + prev = &inode->i_dnotify; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & event) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + changed = 1; + kmem_cache_free(dn_cache, dn); + } + } + if (changed) + redo_inode_mask(inode); + spin_unlock(&inode->i_lock); +} - mutex_unlock(&dnotify_mark_mutex); - fsnotify_put_mark(entry); -out_err: - if (new_entry) - fsnotify_put_mark(new_entry); - if (dn) - kmem_cache_free(dnotify_struct_cache, dn); - return error; +EXPORT_SYMBOL(__inode_dir_notify); + +/* + * This is hopelessly wrong, but unfixable without API changes. At + * least it doesn't oops the kernel... + * + * To safely access ->d_parent we need to keep d_move away from it. Use the + * dentry's d_lock for this. 
+ */ +void dnotify_parent(struct dentry *dentry, unsigned long event) +{ + struct dentry *parent; + + if (!dir_notify_enable) + return; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + if (parent->d_inode->i_dnotify_mask & event) { + dget(parent); + spin_unlock(&dentry->d_lock); + __inode_dir_notify(parent->d_inode, event); + dput(parent); + } else { + spin_unlock(&dentry->d_lock); + } } +EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); - - dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, - 0, &dnotify_fsnotify_ops); - if (IS_ERR(dnotify_group)) - panic("unable to allocate fsnotify group for dnotify\n"); + dn_cache = kmem_cache_create("dnotify_cache", + sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); return 0; } diff --git a/trunk/fs/notify/fsnotify.c b/trunk/fs/notify/fsnotify.c deleted file mode 100644 index ec2f7bd76818..000000000000 --- a/trunk/fs/notify/fsnotify.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -#include -#include "fsnotify.h" - -/* - * Clear all of the marks on an inode when it is being evicted from core - */ -void __fsnotify_inode_delete(struct inode *inode) -{ - fsnotify_clear_marks_by_inode(inode); -} -EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); - -/* - * Given an inode, first check if we care what happens to our children. Inotify - * and dnotify both tell their parents about events. If we care about any event - * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if - * if there is a need to find a parent and send the event to the parent. - */ -void __fsnotify_update_child_dentry_flags(struct inode *inode) -{ - struct dentry *alias; - int watched; - - if (!S_ISDIR(inode->i_mode)) - return; - - /* determine if the children should tell inode about their events */ - watched = fsnotify_inode_watches_children(inode); - - spin_lock(&dcache_lock); - /* run all of the dentries associated with this inode. 
Since this is a - * directory, there damn well better only be one item on this list */ - list_for_each_entry(alias, &inode->i_dentry, d_alias) { - struct dentry *child; - - /* run all of the children of the original inode and fix their - * d_flags to indicate parental interest (their parent is the - * original inode) */ - list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { - if (!child->d_inode) - continue; - - spin_lock(&child->d_lock); - if (watched) - child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; - else - child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; - spin_unlock(&child->d_lock); - } - } - spin_unlock(&dcache_lock); -} - -/* Notify this dentry's parent about a child's events. */ -void __fsnotify_parent(struct dentry *dentry, __u32 mask) -{ - struct dentry *parent; - struct inode *p_inode; - bool send = false; - bool should_update_children = false; - - if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) - return; - - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - p_inode = parent->d_inode; - - if (fsnotify_inode_watches_children(p_inode)) { - if (p_inode->i_fsnotify_mask & mask) { - dget(parent); - send = true; - } - } else { - /* - * The parent doesn't care about events on it's children but - * at least one child thought it did. We need to run all the - * children and update their d_flags to let them know p_inode - * doesn't care about them any more. - */ - dget(parent); - should_update_children = true; - } - - spin_unlock(&dentry->d_lock); - - if (send) { - /* we are notifying a parent so come up with the new mask which - * specifies these are events which came from a child. */ - mask |= FS_EVENT_ON_CHILD; - - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); - dput(parent); - } - - if (unlikely(should_update_children)) { - __fsnotify_update_child_dentry_flags(p_inode); - dput(parent); - } -} -EXPORT_SYMBOL_GPL(__fsnotify_parent); - -/* - * This is the main call to fsnotify. The VFS calls into hook specific functions - * in linux/fsnotify.h. Those functions then in turn call here. Here will call - * out to all of the registered fsnotify_group. Those groups can then use the - * notification event in whatever means they feel necessary. - */ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) -{ - struct fsnotify_group *group; - struct fsnotify_event *event = NULL; - int idx; - /* global tests shouldn't care about events on child only the specific event */ - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); - - if (list_empty(&fsnotify_groups)) - return; - - if (!(test_mask & fsnotify_mask)) - return; - - if (!(test_mask & to_tell->i_fsnotify_mask)) - return; - /* - * SRCU!! the groups list is very very much read only and the path is - * very hot. The VAST majority of events are not going to need to do - * anything other than walk the list so it's crazy to pre-allocate. 
- */ - idx = srcu_read_lock(&fsnotify_grp_srcu); - list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { - if (test_mask & group->mask) { - if (!group->ops->should_send_event(group, to_tell, mask)) - continue; - if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); - /* shit, we OOM'd and now we can't tell, maybe - * someday someone else will want to do something - * here */ - if (!event) - break; - } - group->ops->handle_event(group, event); - } - } - srcu_read_unlock(&fsnotify_grp_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); -} -EXPORT_SYMBOL_GPL(fsnotify); - -static __init int fsnotify_init(void) -{ - return init_srcu_struct(&fsnotify_grp_srcu); -} -subsys_initcall(fsnotify_init); diff --git a/trunk/fs/notify/fsnotify.h b/trunk/fs/notify/fsnotify.h deleted file mode 100644 index 4dc240824b2d..000000000000 --- a/trunk/fs/notify/fsnotify.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __FS_NOTIFY_FSNOTIFY_H_ -#define __FS_NOTIFY_FSNOTIFY_H_ - -#include -#include -#include -#include - -/* protects reads of fsnotify_groups */ -extern struct srcu_struct fsnotify_grp_srcu; -/* all groups which receive fsnotify events */ -extern struct list_head fsnotify_groups; -/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ -extern __u32 fsnotify_mask; - -/* destroy all events sitting in this groups notification queue */ -extern void fsnotify_flush_notify(struct fsnotify_group *group); - -/* final kfree of a group */ -extern void fsnotify_final_destroy_group(struct fsnotify_group *group); - -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* - * update the dentry->d_flags of all of inode's children to indicate if inode cares - * about events that happen to its children. - */ -extern void __fsnotify_update_child_dentry_flags(struct inode *inode); - -/* allocate and destroy and event holder to attach events to notification/access queues */ -extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); -extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); - -#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/trunk/fs/notify/group.c b/trunk/fs/notify/group.c deleted file mode 100644 index 0e1677144bc5..000000000000 --- a/trunk/fs/notify/group.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include "fsnotify.h" - -#include - -/* protects writes to fsnotify_groups and fsnotify_mask */ -static DEFINE_MUTEX(fsnotify_grp_mutex); -/* protects reads while running the fsnotify_groups list */ -struct srcu_struct fsnotify_grp_srcu; -/* all groups registered to receive filesystem notifications */ -LIST_HEAD(fsnotify_groups); -/* bitwise OR of all events (FS_*) interesting to some group on this system */ -__u32 fsnotify_mask; - -/* - * When a new group registers or changes it's set of interesting events - * this function updates the fsnotify_mask to contain all interesting events - */ -void fsnotify_recalc_global_mask(void) -{ - struct fsnotify_group *group; - __u32 mask = 0; - int idx; - - idx = srcu_read_lock(&fsnotify_grp_srcu); - list_for_each_entry_rcu(group, &fsnotify_groups, group_list) - mask |= group->mask; - srcu_read_unlock(&fsnotify_grp_srcu, idx); - fsnotify_mask = mask; -} - -/* - * Update the group->mask by running all of the marks associated with this - * group and finding the bitwise | of all of the mark->mask. If we change - * the group->mask we need to update the global mask of events interesting - * to the system. - */ -void fsnotify_recalc_group_mask(struct fsnotify_group *group) -{ - __u32 mask = 0; - __u32 old_mask = group->mask; - struct fsnotify_mark_entry *entry; - - spin_lock(&group->mark_lock); - list_for_each_entry(entry, &group->mark_entries, g_list) - mask |= entry->mask; - spin_unlock(&group->mark_lock); - - group->mask = mask; - - if (old_mask != mask) - fsnotify_recalc_global_mask(); -} - -/* - * Take a reference to a group so things found under the fsnotify_grp_mutex - * can't get freed under us - */ -static void fsnotify_get_group(struct fsnotify_group *group) -{ - atomic_inc(&group->refcnt); -} - -/* - * Final freeing of a group - */ -void fsnotify_final_destroy_group(struct fsnotify_group *group) -{ - /* clear the notification queue of all events */ - fsnotify_flush_notify(group); - - if (group->ops->free_group_priv) - group->ops->free_group_priv(group); - - kfree(group); -} - -/* - * Trying to get rid of a group. We need to first get rid of any outstanding - * allocations and then free the group. Remember that fsnotify_clear_marks_by_group - * could miss marks that are being freed by inode and those marks could still - * hold a reference to this group (via group->num_marks) If we get into that - * situtation, the fsnotify_final_destroy_group will get called when that final - * mark is freed. - */ -static void fsnotify_destroy_group(struct fsnotify_group *group) -{ - /* clear all inode mark entries for this group */ - fsnotify_clear_marks_by_group(group); - - /* past the point of no return, matches the initial value of 1 */ - if (atomic_dec_and_test(&group->num_marks)) - fsnotify_final_destroy_group(group); -} - -/* - * Remove this group from the global list of groups that will get events - * this can be done even if there are still references and things still using - * this group. This just stops the group from getting new events. - */ -static void __fsnotify_evict_group(struct fsnotify_group *group) -{ - BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); - - if (group->on_group_list) - list_del_rcu(&group->group_list); - group->on_group_list = 0; -} - -/* - * Called when a group is no longer interested in getting events. This can be - * used if a group is misbehaving or if for some reason a group should no longer - * get any filesystem events. 
- */ -void fsnotify_evict_group(struct fsnotify_group *group) -{ - mutex_lock(&fsnotify_grp_mutex); - __fsnotify_evict_group(group); - mutex_unlock(&fsnotify_grp_mutex); -} - -/* - * Drop a reference to a group. Free it if it's through. - */ -void fsnotify_put_group(struct fsnotify_group *group) -{ - if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) - return; - - /* - * OK, now we know that there's no other users *and* we hold mutex, - * so no new references will appear - */ - __fsnotify_evict_group(group); - - /* - * now it's off the list, so the only thing we might care about is - * srcu access.... - */ - mutex_unlock(&fsnotify_grp_mutex); - synchronize_srcu(&fsnotify_grp_srcu); - - /* and now it is really dead. _Nothing_ could be seeing it */ - fsnotify_recalc_global_mask(); - fsnotify_destroy_group(group); -} - -/* - * Simply run the fsnotify_groups list and find a group which matches - * the given parameters. If a group is found we take a reference to that - * group. - */ -static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, - const struct fsnotify_ops *ops) -{ - struct fsnotify_group *group_iter; - struct fsnotify_group *group = NULL; - - BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); - - list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { - if (group_iter->group_num == group_num) { - if ((group_iter->mask == mask) && - (group_iter->ops == ops)) { - fsnotify_get_group(group_iter); - group = group_iter; - } else - group = ERR_PTR(-EEXIST); - } - } - return group; -} - -/* - * Either finds an existing group which matches the group_num, mask, and ops or - * creates a new group and adds it to the global group list. In either case we - * take a reference for the group returned. - */ -struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, - const struct fsnotify_ops *ops) -{ - struct fsnotify_group *group, *tgroup; - - /* very low use, simpler locking if we just always alloc */ - group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); - if (!group) - return ERR_PTR(-ENOMEM); - - atomic_set(&group->refcnt, 1); - - group->on_group_list = 0; - group->group_num = group_num; - group->mask = mask; - - mutex_init(&group->notification_mutex); - INIT_LIST_HEAD(&group->notification_list); - init_waitqueue_head(&group->notification_waitq); - group->q_len = 0; - group->max_events = UINT_MAX; - - spin_lock_init(&group->mark_lock); - atomic_set(&group->num_marks, 0); - INIT_LIST_HEAD(&group->mark_entries); - - group->ops = ops; - - mutex_lock(&fsnotify_grp_mutex); - tgroup = fsnotify_find_group(group_num, mask, ops); - if (tgroup) { - /* group already exists */ - mutex_unlock(&fsnotify_grp_mutex); - /* destroy the new one we made */ - fsnotify_put_group(group); - return tgroup; - } - - /* group not found, add a new one */ - list_add_rcu(&group->group_list, &fsnotify_groups); - group->on_group_list = 1; - /* being on the fsnotify_groups list holds one num_marks */ - atomic_inc(&group->num_marks); - - mutex_unlock(&fsnotify_grp_mutex); - - if (mask) - fsnotify_recalc_global_mask(); - - return group; -} diff --git a/trunk/fs/notify/inode_mark.c b/trunk/fs/notify/inode_mark.c deleted file mode 100644 index c8a07c65482b..000000000000 --- a/trunk/fs/notify/inode_mark.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free 
Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * fsnotify inode mark locking/lifetime/and refcnting - * - * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guarenteed to survive until the reference is dropped. - * - * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: - * - * entry->lock - * group->mark_lock - * inode->i_lock - * - * entry->lock protects 2 things, entry->group and entry->inode. You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the mark_entries list anchored inside a given group - * and each entry is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. - * - * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a - * given inode and each entry is hooked via the i_list. (and sorta the - * free_i_list) - * - * - * LIFETIME: - * Inode marks survive between when they are added to an inode and when their - * refcnt==0. - * - * The inode mark can be cleared for a number of different reasons including: - * - The inode is unlinked for the last time. (fsnotify_inode_remove) - * - The inode is being evicted from cache. (fsnotify_inode_delete) - * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) - * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) - * - The fsnotify_group associated with the mark is going away and all such marks - * need to be cleaned up. (fsnotify_clear_marks_by_group) - * - * Worst case we are given an inode and need to clean up all the marks on that - * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each - * mark on the list we take a reference (so the mark can't disappear under us). - * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. 
We drop our reference we took before we unhooked it - * from the inode. When the ref hits 0 we can free the mark. - * - * Very similarly for freeing by group, except we use free_g_list. - * - * This has the very interesting property of being able to run concurrently with - * any (or all) other directions. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/writeback.h> /* for inode_lock */ - -#include <asm/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -void fsnotify_get_mark(struct fsnotify_mark_entry *entry) -{ - atomic_inc(&entry->refcnt); -} - -void fsnotify_put_mark(struct fsnotify_mark_entry *entry) -{ - if (atomic_dec_and_test(&entry->refcnt)) - entry->free_mark(entry); -} - -/* - * Recalculate the mask of events relevant to a given inode locked. - */ -static void fsnotify_recalc_inode_mask_locked(struct inode *inode) -{ - struct fsnotify_mark_entry *entry; - struct hlist_node *pos; - __u32 new_mask = 0; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) - new_mask |= entry->mask; - inode->i_fsnotify_mask = new_mask; -} - -/* - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this inode. - */ -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - spin_lock(&inode->i_lock); - fsnotify_recalc_inode_mask_locked(inode); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); -} - -/* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the entry->lock - */ -void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) -{ - struct fsnotify_group *group; - struct inode *inode; - - spin_lock(&entry->lock); - - group = entry->group; - inode = entry->inode; - - BUG_ON(group && !inode); - BUG_ON(!group && inode); - - /* if !group something else already marked this to die */ - if (!group) { - spin_unlock(&entry->lock); - return; - } - - /* 1 from caller and 1 for being on i_list/g_list */ - BUG_ON(atomic_read(&entry->refcnt) < 2); - - spin_lock(&group->mark_lock); - spin_lock(&inode->i_lock); - - hlist_del_init(&entry->i_list); - entry->inode = NULL; - - list_del_init(&entry->g_list); - entry->group = NULL; - - fsnotify_put_mark(entry); /* for i_list and g_list */ - - /* - * this mark is now off the inode->i_fsnotify_mark_entries list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - fsnotify_recalc_inode_mask_locked(inode); - - spin_unlock(&inode->i_lock); - spin_unlock(&group->mark_lock); - spin_unlock(&entry->lock); - - /* - * Some groups like to know that marks are being freed. This is a - * callback to the group function to let it know that this entry - * is being freed. - */ - if (group->ops->freeing_mark) - group->ops->freeing_mark(entry, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the entry->lock. - * - * The next time an event arrives at this inode from one of its children - * __fsnotify_parent will see that the inode doesn't care about its - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...)
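Aside (hypothetical helper, not in the patch): the point of the aggregate inode->i_fsnotify_mask recalculated above is that the hot event path can test a single word instead of walking the mark list for every event, roughly:

static int example_inode_cares(struct inode *inode, __u32 mask)
{
	/* one compare against the union of every mark's mask */
	return (inode->i_fsnotify_mask & mask) != 0;
}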
- */ - - - iput(inode); - - /* - * it's possible that this group tried to destroy itself, but this - * this mark was simultaneously being freed by inode. If that's the - * case, we finish freeing the group here. - */ - if (unlikely(atomic_dec_and_test(&group->num_marks))) - fsnotify_final_destroy_group(group); -} - -/* - * Given a group, destroy all of the marks associated with that group. - */ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) -{ - struct fsnotify_mark_entry *lentry, *entry; - LIST_HEAD(free_list); - - spin_lock(&group->mark_lock); - list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { - list_add(&entry->free_g_list, &free_list); - list_del_init(&entry->g_list); - fsnotify_get_mark(entry); - } - spin_unlock(&group->mark_lock); - - list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } -} - -/* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark_entry *entry, *lentry; - struct hlist_node *pos, *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { - list_add(&entry->free_i_list, &free_list); - hlist_del_init(&entry->i_list); - fsnotify_get_mark(entry); - } - spin_unlock(&inode->i_lock); - - list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark_entry *entry; - struct hlist_node *pos; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { - if (entry->group == group) { - fsnotify_get_mark(entry); - return entry; - } - } - return NULL; -} - -/* - * Nothing fancy, just initialize lists and locks and counters. - */ -void fsnotify_init_mark(struct fsnotify_mark_entry *entry, - void (*free_mark)(struct fsnotify_mark_entry *entry)) - -{ - spin_lock_init(&entry->lock); - atomic_set(&entry->refcnt, 1); - INIT_HLIST_NODE(&entry->i_list); - entry->group = NULL; - entry->mask = 0; - entry->inode = NULL; - entry->free_mark = free_mark; -} - -/* - * Attach an initialized mark entry to a given group and inode. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which group and for which inodes. - */ -int fsnotify_add_mark(struct fsnotify_mark_entry *entry, - struct fsnotify_group *group, struct inode *inode) -{ - struct fsnotify_mark_entry *lentry; - int ret = 0; - - inode = igrab(inode); - if (unlikely(!inode)) - return -EINVAL; - - /* - * LOCKING ORDER!!!! 
- * entry->lock - * group->mark_lock - * inode->i_lock - */ - spin_lock(&entry->lock); - spin_lock(&group->mark_lock); - spin_lock(&inode->i_lock); - - entry->group = group; - entry->inode = inode; - - lentry = fsnotify_find_mark_entry(group, inode); - if (!lentry) { - hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); - list_add(&entry->g_list, &group->mark_entries); - - fsnotify_get_mark(entry); /* for i_list and g_list */ - - atomic_inc(&group->num_marks); - - fsnotify_recalc_inode_mask_locked(inode); - } - - spin_unlock(&inode->i_lock); - spin_unlock(&group->mark_lock); - spin_unlock(&entry->lock); - - if (lentry) { - ret = -EEXIST; - iput(inode); - fsnotify_put_mark(lentry); - } else { - __fsnotify_update_child_dentry_flags(inode); - } - - return ret; -} - -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) - * - * Called with inode_lock held, protecting the unmounting super block's list - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. - * We temporarily drop inode_lock, however, and CAN block. - */ -void fsnotify_unmount_inodes(struct list_head *list) -{ - struct inode *inode, *next_i, *need_iput = NULL; - - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { - struct inode *need_iput_tmp; - - /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) - continue; - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) - continue; - - need_iput_tmp = need_iput; - need_iput = NULL; - - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; - - /* In case the dropping of a reference would nuke next_i. */ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; - } - - /* - * We can safely drop inode_lock here because we hold - * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. - */ - spin_unlock(&inode_lock); - - if (need_iput_tmp) - iput(need_iput_tmp); - - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - - fsnotify_inode_delete(inode); - - iput(inode); - - spin_lock(&inode_lock); - } -} diff --git a/trunk/fs/notify/inotify/Kconfig b/trunk/fs/notify/inotify/Kconfig index 5356884289a1..446792841023 100644 --- a/trunk/fs/notify/inotify/Kconfig +++ b/trunk/fs/notify/inotify/Kconfig @@ -1,30 +1,26 @@ config INOTIFY bool "Inotify file change notification support" - default n + default y ---help--- - Say Y here to enable legacy in kernel inotify support. Inotify is a - file change notification system. It is a replacement for dnotify. - This option only provides the legacy inotify in kernel API. There - are no in tree kernel users of this interface since it is deprecated. 
- You only need this if you are loading an out of tree kernel module - that uses inotify. + Say Y here to enable inotify support. Inotify is a file change + notification system and a replacement for dnotify. Inotify fixes + numerous shortcomings in dnotify and introduces several new features + including multiple file events, one-shot support, and unmount + notification. For more information, see <file:Documentation/filesystems/inotify.txt> - If unsure, say N. + If unsure, say Y. config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + depends on INOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the associated system calls. Inotify allows monitoring of both files and directories via a single open fd. Events are read from the file descriptor, which is also select()- and poll()-able. - Inotify fixes numerous shortcomings in dnotify and introduces several - new features including multiple file events, one-shot support, and - unmount notification. For more information, see <file:Documentation/filesystems/inotify.txt> diff --git a/trunk/fs/notify/inotify/Makefile b/trunk/fs/notify/inotify/Makefile index 943828171362..e290f3bb9d8d 100644 --- a/trunk/fs/notify/inotify/Makefile +++ b/trunk/fs/notify/inotify/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o +obj-$(CONFIG_INOTIFY_USER) += inotify_user.o diff --git a/trunk/fs/notify/inotify/inotify.c b/trunk/fs/notify/inotify/inotify.c index 40b1cf914ccb..220c13f0d73d 100644 --- a/trunk/fs/notify/inotify/inotify.c +++ b/trunk/fs/notify/inotify/inotify.c @@ -32,7 +32,6 @@ #include <linux/list.h> #include <linux/writeback.h> #include <linux/inotify.h> -#include <linux/fsnotify_backend.h> static atomic_t inotify_cookie; @@ -906,25 +905,6 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); */ static int __init inotify_setup(void) { - BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); - BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(IN_OPEN != FS_OPEN); - BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); - BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); - BUILD_BUG_ON(IN_CREATE != FS_CREATE); - BUILD_BUG_ON(IN_DELETE != FS_DELETE); - BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); - BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); - BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); - - BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); - BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); - BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); - BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - atomic_set(&inotify_cookie, 0); return 0; diff --git a/trunk/fs/notify/inotify/inotify.h b/trunk/fs/notify/inotify/inotify.h deleted file mode 100644 index ea2605a58b8a..000000000000 --- a/trunk/fs/notify/inotify/inotify.h +++ /dev/null @@ -1,21 +0,0 @@ -#include <linux/fsnotify_backend.h> -#include <linux/inotify.h> -#include <linux/slab.h> /* struct kmem_cache */ - -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; - int wd; -}; - -struct inotify_inode_mark_entry { - /* fsnotify_mark_entry MUST be the first thing */ - struct fsnotify_mark_entry fsn_entry; - int wd; -}; - -extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); - -extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/trunk/fs/notify/inotify/inotify_fsnotify.c b/trunk/fs/notify/inotify/inotify_fsnotify.c deleted file mode 100644 index 7ef75b83247e..000000000000 ---
a/trunk/fs/notify/inotify/inotify_fsnotify.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * fs/inotify_user.c - inotify support for userspace - * - * Authors: - * John McCutchan - * Robert Love - * - * Copyright (C) 2005 John McCutchan - * Copyright 2006 Hewlett-Packard Development Company, L.P. - * - * Copyright (C) 2009 Eric Paris - * inotify was largely rewritten to make use of the fsnotify infrastructure - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2, or (at your option) any - * later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#include <linux/fs.h> /* struct inode */ -#include <linux/fsnotify_backend.h> -#include <linux/inotify.h> -#include <linux/path.h> /* struct path */ -#include <linux/slab.h> /* kmem_* */ -#include <linux/types.h> - -#include "inotify.h" - -static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_mark_entry *entry; - struct inotify_inode_mark_entry *ientry; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int wd, ret; - - to_tell = event->to_tell; - - spin_lock(&to_tell->i_lock); - entry = fsnotify_find_mark_entry(group, to_tell); - spin_unlock(&to_tell->i_lock); - /* race with watch removal? We already passed should_send */ - if (unlikely(!entry)) - return 0; - ientry = container_of(entry, struct inotify_inode_mark_entry, - fsn_entry); - wd = ientry->wd; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) - return -ENOMEM; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsn_event_priv->group = group; - event_priv->wd = wd; - - ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached?
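Aside (self-contained sketch, not from the patch): inotify_handle_event above recovers its private wrapper from the embedded fsnotify_mark_entry with container_of. The pattern in miniature, with invented names:

#include <stddef.h>

struct base { int refcnt; };

struct wrapper {
	struct base b;	/* kept first so a base * may also be cast directly */
	int wd;
};

#define example_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct wrapper *to_wrapper(struct base *bp)
{
	return example_container_of(bp, struct wrapper, b);
}

container_of itself works at any member offset; the "MUST be the first thing" rule in inotify.h exists because inotify_free_mark casts the pointer directly instead of going through container_of.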
*/ - if (list_empty(&fsn_event_priv->event_list)) - inotify_free_event_priv(fsn_event_priv); - - /* - * If we hold the entry until after the event is on the queue - * IN_IGNORED won't be able to pass this event in the queue - */ - fsnotify_put_mark(entry); - - return ret; -} - -static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) -{ - inotify_destroy_mark_entry(entry, group); -} - -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) -{ - struct fsnotify_mark_entry *entry; - bool send; - - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - if (!entry) - return false; - - mask = (mask & ~FS_EVENT_ON_CHILD); - send = (entry->mask & mask); - - /* find took a reference */ - fsnotify_put_mark(entry); - - return send; -} - -static int idr_callback(int id, void *p, void *data) -{ - BUG(); - return 0; -} - -static void inotify_free_group_priv(struct fsnotify_group *group) -{ - /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); - idr_remove_all(&group->inotify_data.idr); - idr_destroy(&group->inotify_data.idr); -} - -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) -{ - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - kmem_cache_free(event_priv_cachep, event_priv); -} - -const struct fsnotify_ops inotify_fsnotify_ops = { - .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, - .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, - .freeing_mark = inotify_freeing_mark, -}; diff --git a/trunk/fs/notify/inotify/inotify_user.c b/trunk/fs/notify/inotify/inotify_user.c index 982a412ac5bc..1634319e2404 100644 --- a/trunk/fs/notify/inotify/inotify_user.c +++ b/trunk/fs/notify/inotify/inotify_user.c @@ -8,9 +8,6 @@ * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * - * Copyright (C) 2009 Eric Paris - * inotify was largely rewriten to make use of the fsnotify infrastructure - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -22,48 +19,94 @@ * General Public License for more details. */ +#include +#include +#include +#include #include -#include /* struct inode */ -#include -#include -#include /* module_init */ +#include +#include +#include +#include +#include #include -#include /* roundup() */ -#include /* superblock magic number */ -#include /* mntget */ -#include /* LOOKUP_FOLLOW */ -#include /* struct path */ -#include /* struct user */ -#include /* struct kmem_cache */ #include -#include -#include -#include -#include - -#include "inotify.h" +#include #include -static struct vfsmount *inotify_mnt __read_mostly; +static struct kmem_cache *watch_cachep __read_mostly; +static struct kmem_cache *event_cachep __read_mostly; -/* this just sits here and wastes global memory. 
used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; +static struct vfsmount *inotify_mnt __read_mostly; /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; +static int inotify_max_user_watches __read_mostly; static int inotify_max_queued_events __read_mostly; -int inotify_max_user_watches __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; +/* + * Lock ordering: + * + * inotify_dev->up_mutex (ensures we don't re-add the same watch) + * inode->inotify_mutex (protects inode's watch list) + * inotify_handle->mutex (protects inotify_handle's watch list) + * inotify_dev->ev_mutex (protects device's event queue) + */ /* - * When inotify registers a new group it increments this and uses that - * value as an offset to set the fsnotify group "name" and priority. + * Lifetimes of the main data structures: + * + * inotify_device: Lifetime is managed by reference count, from + * sys_inotify_init() until release. Additional references can bump the count + * via get_inotify_dev() and drop the count via put_inotify_dev(). + * + * inotify_user_watch: Lifetime is from create_watch() to the receipt of an + * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the + * first event, or to inotify_destroy(). */ -static atomic_t inotify_grp_num; + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. + */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct mutex ev_mutex; /* protects event queue */ + struct mutex up_mutex; /* synchronizes watch updates */ + struct list_head events; /* list of queued events */ + struct user_struct *user; /* user who opened this dev */ + struct inotify_handle *ih; /* inotify handle */ + struct fasync_struct *fa; /* async notification */ + atomic_t count; /* reference count */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->ev_mutex of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_user_watch - our version of an inotify_watch, we add + * a reference to the associated inotify_device. 
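For orientation (userspace view, not part of the patch): the queue of inotify_kernel_events described above is what a consumer drains with read(); each record is a struct inotify_event followed by a zero-padded name of event.len bytes. A minimal, runnable consumer:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0)
		return 1;

	n = read(fd, buf, sizeof(buf));		/* blocks until events queue up */
	for (char *p = buf; p < buf + n; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=0x%x name=%s\n", ev->wd, ev->mask,
		       ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;	/* records are variable length */
	}
	close(fd);
	return 0;
}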
+ */ +struct inotify_user_watch { + struct inotify_device *dev; /* associated device */ + struct inotify_watch wdata; /* inotify watch data */ +}; #ifdef CONFIG_SYSCTL @@ -106,36 +149,280 @@ ctl_table inotify_table[] = { }; #endif /* CONFIG_SYSCTL */ -static inline __u32 inotify_arg_to_mask(u32 arg) +static inline void get_inotify_dev(struct inotify_device *dev) +{ + atomic_inc(&dev->count); +} + +static inline void put_inotify_dev(struct inotify_device *dev) +{ + if (atomic_dec_and_test(&dev->count)) { + atomic_dec(&dev->user->inotify_devs); + free_uid(dev->user); + kfree(dev); + } +} + +/* + * free_inotify_user_watch - cleans up the watch and its references + */ +static void free_inotify_user_watch(struct inotify_watch *w) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; + + atomic_dec(&dev->user->inotify_watches); + put_inotify_dev(dev); + kmem_cache_free(watch_cachep, watch); +} + +/* + * kernel_event - create a new kernel event with the given parameters + * + * This function can sleep. + */ +static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, + const char *name) +{ + struct inotify_kernel_event *kevent; + + kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); + if (unlikely(!kevent)) + return NULL; + + /* we hand this out to user-space, so zero it just in case */ + memset(&kevent->event, 0, sizeof(struct inotify_event)); + + kevent->event.wd = wd; + kevent->event.mask = mask; + kevent->event.cookie = cookie; + + INIT_LIST_HEAD(&kevent->list); + + if (name) { + size_t len, rem, event_size = sizeof(struct inotify_event); + + /* + * We need to pad the filename so as to properly align an + * array of inotify_event structures. Because the structure is + * small and the common case is a small filename, we just round + * up to the next multiple of the structure's sizeof. This is + * simple and safe for all architectures. + */ + len = strlen(name) + 1; + rem = event_size - len; + if (len > event_size) { + rem = event_size - (len % event_size); + if (len % event_size == 0) + rem = 0; + } + + kevent->name = kmalloc(len + rem, GFP_NOFS); + if (unlikely(!kevent->name)) { + kmem_cache_free(event_cachep, kevent); + return NULL; + } + memcpy(kevent->name, name, len); + if (rem) + memset(kevent->name + len, 0, rem); + kevent->event.len = len + rem; + } else { + kevent->event.len = 0; + kevent->name = NULL; + } + + return kevent; +} + +/* + * inotify_dev_get_event - return the next event in the given dev's queue + * + * Caller must hold dev->ev_mutex. + */ +static inline struct inotify_kernel_event * +inotify_dev_get_event(struct inotify_device *dev) +{ + return list_entry(dev->events.next, struct inotify_kernel_event, list); +} + +/* + * inotify_dev_get_last_event - return the last event in the given dev's queue + * + * Caller must hold dev->ev_mutex. + */ +static inline struct inotify_kernel_event * +inotify_dev_get_last_event(struct inotify_device *dev) { - __u32 mask; + if (list_empty(&dev->events)) + return NULL; + return list_entry(dev->events.prev, struct inotify_kernel_event, list); +} - /* everything should accept their own ignored and cares about children */ - mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); +/* + * inotify_dev_queue_event - event handler registered with core inotify, adds + * a new event to the given device + * + * Can sleep (calls kernel_event()). 
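Aside (hypothetical equivalent, not in the patch): the branchy padding math in kernel_event() above reduces to rounding the NUL-terminated name length up to a multiple of sizeof(struct inotify_event), which keeps the records aligned in the read() buffer:

#include <stddef.h>

/* name_len includes the trailing NUL and is at least 1 */
static size_t example_padded_len(size_t name_len, size_t event_size)
{
	size_t rem = name_len % event_size;

	return rem ? name_len + (event_size - rem) : name_len;
}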
+ */ +static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, + u32 cookie, const char *name, + struct inode *ignored) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + struct inotify_kernel_event *kevent, *last; - /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; - return mask; + mutex_lock(&dev->ev_mutex); + + /* we can safely put the watch as we don't reference it while + * generating the event + */ + if (mask & IN_IGNORED || w->mask & IN_ONESHOT) + put_inotify_watch(w); /* final put */ + + /* coalescing: drop this event if it is a dupe of the previous */ + last = inotify_dev_get_last_event(dev); + if (last && last->event.mask == mask && last->event.wd == wd && + last->event.cookie == cookie) { + const char *lastname = last->name; + + if (!name && !lastname) + goto out; + if (name && lastname && !strcmp(lastname, name)) + goto out; + } + + /* the queue overflowed and we already sent the Q_OVERFLOW event */ + if (unlikely(dev->event_count > dev->max_events)) + goto out; + + /* if the queue overflows, we need to notify user space */ + if (unlikely(dev->event_count == dev->max_events)) + kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); + else + kevent = kernel_event(wd, mask, cookie, name); + + if (unlikely(!kevent)) + goto out; + + /* queue the event and wake up anyone waiting */ + dev->event_count++; + dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; + list_add_tail(&kevent->list, &dev->events); + wake_up_interruptible(&dev->wq); + kill_fasync(&dev->fa, SIGIO, POLL_IN); + +out: + mutex_unlock(&dev->ev_mutex); +} + +/* + * remove_kevent - cleans up the given kevent + * + * Caller must hold dev->ev_mutex. + */ +static void remove_kevent(struct inotify_device *dev, + struct inotify_kernel_event *kevent) +{ + list_del(&kevent->list); + + dev->event_count--; + dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} + +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ + kfree(kevent->name); + kmem_cache_free(event_cachep, kevent); +} + +/* + * inotify_dev_event_dequeue - destroy an event on the given device + * + * Caller must hold dev->ev_mutex. + */ +static void inotify_dev_event_dequeue(struct inotify_device *dev) +{ + if (!list_empty(&dev->events)) { + struct inotify_kernel_event *kevent; + kevent = inotify_dev_get_event(dev); + remove_kevent(dev, kevent); + free_kevent(kevent); + } +} + +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int find_inode(const char __user *dirname, struct path *path, + unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; } -static inline u32 inotify_mask_to_arg(__u32 mask) +/* + * create_watch - creates a watch on the given device. + * + * Callers must hold dev->up_mutex. 
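Aside (condensed restatement, not a new function in the patch): the coalescing test in inotify_dev_queue_event() above drops an event only when it exactly duplicates the newest queued event, so a burst of identical events occupies one queue slot. Assuming the struct inotify_kernel_event defined earlier:

static int example_is_dup(const struct inotify_kernel_event *last,
			  u32 wd, u32 mask, u32 cookie, const char *name)
{
	if (!last || last->event.mask != mask || last->event.wd != wd ||
	    last->event.cookie != cookie)
		return 0;
	if (!name && !last->name)
		return 1;
	return name && last->name && !strcmp(name, last->name);
}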
+ */ +static int create_watch(struct inotify_device *dev, struct inode *inode, + u32 mask) { - return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | - IN_Q_OVERFLOW); + struct inotify_user_watch *watch; + int ret; + + if (atomic_read(&dev->user->inotify_watches) >= + inotify_max_user_watches) + return -ENOSPC; + + watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); + if (unlikely(!watch)) + return -ENOMEM; + + /* save a reference to device and bump the count to make it official */ + get_inotify_dev(dev); + watch->dev = dev; + + atomic_inc(&dev->user->inotify_watches); + + inotify_init_watch(&watch->wdata); + ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); + if (ret < 0) + free_inotify_user_watch(&watch->wdata); + + return ret; } -/* intofiy userspace file descriptor functions */ +/* Device Interface */ + static unsigned int inotify_poll(struct file *file, poll_table *wait) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; int ret = 0; - poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); - if (!fsnotify_notify_queue_is_empty(group)) + poll_wait(file, &dev->wq, wait); + mutex_lock(&dev->ev_mutex); + if (!list_empty(&dev->events)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + mutex_unlock(&dev->ev_mutex); return ret; } @@ -145,29 +432,26 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the group->notification_mutex held. + * Called with the device ev_mutex held. */ -static struct fsnotify_event *get_one_event(struct fsnotify_group *group, - size_t count) +static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, + size_t count) { size_t event_size = sizeof(struct inotify_event); - struct fsnotify_event *event; + struct inotify_kernel_event *kevent; - if (fsnotify_notify_queue_is_empty(group)) + if (list_empty(&dev->events)) return NULL; - event = fsnotify_peek_notify_event(group); - - event_size += roundup(event->name_len, event_size); + kevent = inotify_dev_get_event(dev); + if (kevent->name) + event_size += kevent->event.len; if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the - * same event we peeked above */ - fsnotify_remove_notify_event(group); - - return event; + remove_kevent(dev, kevent); + return kevent; } /* @@ -176,90 +460,51 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. 
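For orientation (userspace view, not part of the patch): inotify_poll() above is what makes the descriptor select()/poll()-able, so a consumer can multiplex inotify with other fds:

#include <poll.h>

/* returns >0 when at least one event is queued, 0 on timeout, <0 on error */
static int example_wait_for_events(int inotify_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = inotify_fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);
}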
*/ -static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, +static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, char __user *buf) { - struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len; - - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } - - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); - inotify_event.cookie = event->sync_cookie; - /* send the main event */ - if (copy_to_user(buf, &inotify_event, event_size)) + if (copy_to_user(buf, &kevent->event, event_size)) return -EFAULT; - buf += event_size; + if (kevent->name) { + buf += event_size; - /* - * fsnotify only stores the pathname, so here we have to send the pathname - * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. - */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; - /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, kevent->name, kevent->event.len)) return -EFAULT; - buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) - return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += kevent->event.len; } - return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - struct fsnotify_group *group; - struct fsnotify_event *kevent; + struct inotify_device *dev; char __user *start; int ret; DEFINE_WAIT(wait); start = buf; - group = file->private_data; + dev = file->private_data; while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + struct inotify_kernel_event *kevent; - mutex_lock(&group->notification_mutex); - kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&dev->ev_mutex); + kevent = get_one_event(dev, count); + mutex_unlock(&dev->ev_mutex); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; - ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + ret = copy_event_to_user(kevent, buf); + free_kevent(kevent); if (ret < 0) break; buf += ret; @@ -280,7 +525,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, schedule(); } - finish_wait(&group->notification_waitq, &wait); + finish_wait(&dev->wq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -288,19 +533,25 @@ static ssize_t inotify_read(struct file *file, char __user *buf, static int inotify_fasync(int fd, struct file *file, int on) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; - return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 
0 : -EIO; + return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; } static int inotify_release(struct inode *ignored, struct file *file) { - struct fsnotify_group *group = file->private_data; + struct inotify_device *dev = file->private_data; + + inotify_destroy(dev->ih); - fsnotify_clear_marks_by_group(group); + /* destroy all of the events on this device */ + mutex_lock(&dev->ev_mutex); + while (!list_empty(&dev->events)) + inotify_dev_event_dequeue(dev); + mutex_unlock(&dev->ev_mutex); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ - fsnotify_put_group(group); + /* free this device: the put matching the get in inotify_init() */ + put_inotify_dev(dev); return 0; } @@ -308,27 +559,16 @@ static int inotify_release(struct inode *ignored, struct file *file) static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct inotify_device *dev; void __user *p; int ret = -ENOTTY; - size_t send_len = 0; - group = file->private_data; + dev = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; - send_len += sizeof(struct inotify_event); - send_len += roundup(event->name_len, - sizeof(struct inotify_event)); - } - mutex_unlock(&group->notification_mutex); - ret = put_user(send_len, (int __user *) p); + ret = put_user(dev->queue_size, (int __user *) p); break; } @@ -336,233 +576,23 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { - .poll = inotify_poll, - .read = inotify_read, - .fasync = inotify_fasync, - .release = inotify_release, - .unlocked_ioctl = inotify_ioctl, + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; +static const struct inotify_operations inotify_user_ops = { + .handle_event = inotify_dev_queue_event, + .destroy_watch = free_inotify_user_watch, +}; -/* - * find_inode - resolve a user-given path to a specific inode - */ -static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) -{ - int error; - - error = user_path_at(AT_FDCWD, dirname, flags, path); - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); - if (error) - path_put(path); - return error; -} - -/* - * When, for whatever reason, inotify is done with a mark (or what used to be a - * watch) we need to remove that watch from the idr and we need to send IN_IGNORED - * for the given wd. - * - * There is a bit of recursion here. The loop looks like: - * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> - * inotify_freeing_mark -> inotify_destory_mark_entry -> restart - * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets - * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) - * test below will not call back to fsnotify again. But even if that test wasn't - * there this would still be safe since fsnotify_destroy_mark_by_entry() is - * safe from recursion. 
- */ -void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) -{ - struct inotify_inode_mark_entry *ientry; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_group *egroup; - struct idr *idr; - - spin_lock(&entry->lock); - egroup = entry->group; - - /* if egroup we aren't really done and something might still send events - * for this inode, on the callback we'll send the IN_IGNORED */ - if (egroup) { - spin_unlock(&entry->lock); - fsnotify_destroy_mark_by_entry(entry); - return; - } - spin_unlock(&entry->lock); - - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsn_event_priv->group = group; - event_priv->wd = ientry->wd; - - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) - inotify_free_event_priv(fsn_event_priv); - -skip_send_ignore: - - /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); -} - -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark_entry *entry) -{ - struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; - - kmem_cache_free(inotify_inode_mark_cachep, ientry); -} - -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) -{ - struct fsnotify_mark_entry *entry = NULL; - struct inotify_inode_mark_entry *ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; - __u32 old_mask, new_mask; - - /* don't allow invalid bits: we don't want flags set */ - mask = inotify_arg_to_mask(arg); - if (unlikely(!mask)) - return -EINVAL; - - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; - -find_entry: - spin_lock(&inode->i_lock); - entry = fsnotify_find_mark_entry(group, inode); - spin_unlock(&inode->i_lock); - if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) - goto out_err; - - entry = &ientry->fsn_entry; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. 
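Aside (hypothetical helper, not in the patch): the retry dance around idr_pre_get()/idr_get_new_above() here is the standard pattern for the old two-step idr API: preload memory without the lock, allocate the id under the lock, and loop on -EAGAIN if the preloaded memory was consumed by someone else:

static int example_idr_alloc(struct idr *idr, spinlock_t *lock,
			     void *ptr, int start, int *id)
{
	int ret;

	do {
		if (!idr_pre_get(idr, GFP_KERNEL))
			return -ENOMEM;
		spin_lock(lock);
		ret = idr_get_new_above(idr, ptr, start, id);
		spin_unlock(lock);
	} while (ret == -EAGAIN);

	return ret;
}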
remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } - atomic_inc(&group->inotify_data.user->inotify_watches); - } - - spin_lock(&entry->lock); - - old_mask = entry->mask; - if (add) { - entry->mask |= mask; - new_mask = entry->mask; - } else { - entry->mask = mask; - new_mask = entry->mask; - } - - spin_unlock(&entry->lock); - - if (old_mask != new_mask) { - /* more bits in old than in new? */ - int dropped = (old_mask & ~new_mask); - /* more bits in this entry than the inode's mask? */ - int do_inode = (new_mask & ~inode->i_fsnotify_mask); - /* more bits in this entry than the group? */ - int do_group = (new_mask & ~group->mask); - - /* update the inode with this new entry */ - if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); - - /* update the group mask with the new mask */ - if (dropped || do_group) - fsnotify_recalc_group_mask(group); - } - - return ientry->wd; - -out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); - } - return ret; -} - -static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) -{ - struct fsnotify_group *group; - unsigned int grp_num; - - /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ - grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); - group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); - if (IS_ERR(group)) - return group; - - group->max_events = max_events; - - spin_lock_init(&group->inotify_data.idr_lock); - idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; - group->inotify_data.user = user; - group->inotify_data.fa = NULL; - - return group; -} - - -/* inotify syscalls */ SYSCALL_DEFINE1(inotify_init1, int, flags) { - struct fsnotify_group *group; + struct inotify_device *dev; + struct inotify_handle *ih; struct user_struct *user; struct file *filp; int fd, ret; @@ -591,27 +621,45 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ - group = inotify_new_group(user, inotify_max_queued_events); - if (IS_ERR(group)) { - ret = PTR_ERR(group); + dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); + if (unlikely(!dev)) { + ret = -ENOMEM; goto out_free_uid; } + ih = inotify_init(&inotify_user_ops); + if (IS_ERR(ih)) { + ret = PTR_ERR(ih); + goto out_free_dev; + } + dev->ih = ih; + dev->fa = NULL; + filp->f_op = &inotify_fops; filp->f_path.mnt = mntget(inotify_mnt); filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); - filp->private_data = group; - + filp->private_data = dev; + + INIT_LIST_HEAD(&dev->events); + init_waitqueue_head(&dev->wq); + mutex_init(&dev->ev_mutex); + mutex_init(&dev->up_mutex); + dev->event_count = 0; + dev->queue_size = 0; + dev->max_events = inotify_max_queued_events; + dev->user = user; + atomic_set(&dev->count, 0); + + get_inotify_dev(dev); atomic_inc(&user->inotify_devs); - fd_install(fd, filp); return fd; - +out_free_dev: + kfree(dev); out_free_uid: free_uid(user); put_filp(filp); @@ -628,8 
+676,8 @@ SYSCALL_DEFINE0(inotify_init) SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { - struct fsnotify_group *group; struct inode *inode; + struct inotify_device *dev; struct path path; struct file *filp; int ret, fput_needed; @@ -650,20 +698,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = inotify_find_inode(pathname, &path, flags); - if (ret) + ret = find_inode(pathname, &path, flags); + if (unlikely(ret)) goto fput_and_out; - /* inode held in place by reference to path; group by fget on fd */ + /* inode held in place by reference to path; dev by fget on fd */ inode = path.dentry->d_inode; - group = filp->private_data; + dev = filp->private_data; - /* create/update an inode mark */ - ret = inotify_update_watch(group, inode, mask); - if (unlikely(ret)) - goto path_put_and_out; + mutex_lock(&dev->up_mutex); + ret = inotify_find_update_watch(dev->ih, inode, mask); + if (ret == -ENOENT) + ret = create_watch(dev, inode, mask); + mutex_unlock(&dev->up_mutex); -path_put_and_out: path_put(&path); fput_and_out: fput_light(filp, fput_needed); @@ -672,10 +720,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { - struct fsnotify_group *group; - struct fsnotify_mark_entry *entry; struct file *filp; - int ret = 0, fput_needed; + struct inotify_device *dev; + int ret, fput_needed; filp = fget_light(fd, &fput_needed); if (unlikely(!filp)) @@ -687,20 +734,10 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) goto out; } - group = filp->private_data; + dev = filp->private_data; - spin_lock(&group->inotify_data.idr_lock); - entry = idr_find(&group->inotify_data.idr, wd); - if (unlikely(!entry)) { - spin_unlock(&group->inotify_data.idr_lock); - ret = -EINVAL; - goto out; - } - fsnotify_get_mark(entry); - spin_unlock(&group->inotify_data.idr_lock); - - inotify_destroy_mark_entry(entry, group); - fsnotify_put_mark(entry); + /* we free our watch data when we get IN_IGNORED */ + ret = inotify_rm_wd(dev->ih, wd); out: fput_light(filp, fput_needed); @@ -716,9 +753,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, } static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, }; /* @@ -738,16 +775,18 @@ static int __init inotify_user_setup(void) if (IS_ERR(inotify_mnt)) panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); - inotify_max_queued_events = 16384; inotify_max_user_instances = 128; inotify_max_user_watches = 8192; + watch_cachep = kmem_cache_create("inotify_watch_cache", + sizeof(struct inotify_user_watch), + 0, SLAB_PANIC, NULL); + event_cachep = kmem_cache_create("inotify_event_cache", + sizeof(struct inotify_kernel_event), + 0, SLAB_PANIC, NULL); + return 0; } + module_init(inotify_user_setup); diff --git a/trunk/fs/notify/notification.c b/trunk/fs/notify/notification.c deleted file mode 100644 index 959b73e756fd..000000000000 --- a/trunk/fs/notify/notification.c +++ 
/dev/null @@ -1,411 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * Basic idea behind the notification queue: An fsnotify group (like inotify) - * sends the userspace notification about events asynchronously some time after - * the event happened. When inotify gets an event it will need to add that - * event to the group notify queue. Since a single event might need to be on - * multiple groups' notification queues we can't add the event directly to each - * queue and instead add a small "event_holder" to each queue. This event_holder - * has a pointer back to the original event. Since the majority of events are - * going to end up on one, and only one, notification queue we embed one - * event_holder into each event. This means we have a single allocation instead - * of always needing two. If the embedded event_holder is already in use by - * another group a new event_holder (from fsnotify_event_holder_cachep) will be - * allocated and used. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/namei.h> -#include <linux/path.h> -#include <linux/slab.h> -#include <linux/spinlock.h> - -#include <asm/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. Its refcnt is set to 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event q_overflow_event; -static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); - -/** - * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. - * Called from fsnotify_move, which is inlined into filesystem modules. - */ -u32 fsnotify_get_cookie(void) -{ - return atomic_inc_return(&fsnotify_sync_cookie); -} -EXPORT_SYMBOL_GPL(fsnotify_get_cookie); - -/* return true if the notify queue is empty, false otherwise */ -bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) -{ - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - return list_empty(&group->notification_list) ?
true : false; -} - -void fsnotify_get_event(struct fsnotify_event *event) -{ - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) - return; - - if (atomic_dec_and_test(&event->refcnt)) { - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; -} - -/* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. - */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) -{ - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - case (FSNOTIFY_EVENT_NONE): - return true; - }; - } - return false; -} - -/* - * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. - */ -int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv) -{ - struct fsnotify_event_holder *holder = NULL; - struct list_head *list = &group->notification_list; - struct fsnotify_event_holder *last_holder; - struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... 
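Aside (restating the invariant, not new patch code): an event's embedded holder is free exactly when its list node is unhooked, which is why the code below can test list_empty() before deciding to allocate; the shared q_overflow_event is the one case where the embedded holder is routinely busy:

static int example_holder_in_use(struct fsnotify_event *event)
{
	return !list_empty(&event->holder.event_list);
}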
- */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return -ENOMEM; - } - - mutex_lock(&group->notification_mutex); - - if (group->q_len >= group->max_events) { - event = &q_overflow_event; - /* sorry, no private data on the overflow event */ - priv = NULL; - } - - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - goto alloc_holder; - } - - if (!list_empty(list)) { - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) { - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return -EEXIST; - } - } - - group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - - wake_up(&group->notification_waitq); - return 0; -} - -/* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. - */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - - group->q_len--; - - return event; -} - -/* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() - */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; -} - -/* - * Called when a group is being torn down to clean up any outstanding - * event notifications. 
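Aside (hypothetical caller, not in the patch): the dequeue contract above is that peek and remove happen under a single hold of notification_mutex, and the removed event carries a reference the caller must drop:

static struct fsnotify_event *example_pop(struct fsnotify_group *group)
{
	struct fsnotify_event *event = NULL;

	mutex_lock(&group->notification_mutex);
	if (!fsnotify_notify_queue_is_empty(group))
		event = fsnotify_remove_notify_event(group);
	mutex_unlock(&group->notification_mutex);

	return event;	/* caller owns a reference; fsnotify_put_event() it */
}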
- */ -void fsnotify_flush_notify(struct fsnotify_group *group) -{ - struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; - - mutex_lock(&group->notification_mutex); - while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ - } - mutex_unlock(&group->notification_mutex); -} - -static void initialize_event(struct fsnotify_event *event) -{ - event->holder.event = NULL; - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - event->path.dentry = NULL; - event->path.mnt = NULL; - event->inode = NULL; - event->data_type = FSNOTIFY_EVENT_NONE; - - INIT_LIST_HEAD(&event->private_data_list); - - event->to_tell = NULL; - - event->file_name = NULL; - event->name_len = 0; - - event->sync_cookie = 0; -} - -/* - * fsnotify_create_event - Allocate a new event which will be sent to each - * group's handle_event function if the group was interested in this - * particular event. - * - * @to_tell the inode which is supposed to receive the event (sometimes a - * parent of the inode to which the event happened). - * @mask what actually happened. - * @data pointer to the object which was actually affected - * @data_type flag indicating whether the data is a file, path, inode, nothing... - * @name the filename, if available - */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->sync_cookie = cookie; - event->to_tell = to_tell; - - switch (data_type) { - case FSNOTIFY_EVENT_FILE: { - struct file *file = data; - struct path *path = &file->f_path; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - event->data_type = FSNOTIFY_EVENT_PATH; - break; - } - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - event->data_type = FSNOTIFY_EVENT_PATH; - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - event->data_type = FSNOTIFY_EVENT_INODE; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - - event->mask = mask; - - return event; -} - -__init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - initialize_event(&q_overflow_event); - q_overflow_event.mask = FS_Q_OVERFLOW; - - return 0; -} -subsys_initcall(fsnotify_notification_init); - diff --git a/trunk/fs/ntfs/super.c b/trunk/fs/ntfs/super.c index abaaa1cbf8de..6aa7c4713536 100644 --- a/trunk/fs/ntfs/super.c +++ b/trunk/fs/ntfs/super.c @@ -443,8 +443,6 @@ static int 
ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering with remount options string: %s", opt); - - lock_kernel(); #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; @@ -468,18 +466,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) if (NVolErrors(vol)) { ntfs_error(sb, "Volume has errors and is read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_IS_DIRTY) { ntfs_error(sb, "Volume is dirty and read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { ntfs_error(sb, "Volume has been modified by chkdsk " "and is read-only%s", es); - unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { @@ -487,13 +482,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) "(0x%x) and is read-only%s", (unsigned)le16_to_cpu(vol->vol_flags), es); - unlock_kernel(); return -EROFS; } if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { ntfs_error(sb, "Failed to set dirty bit in volume " "information flags%s", es); - unlock_kernel(); return -EROFS; } #if 0 @@ -513,21 +506,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_error(sb, "Failed to empty journal $LogFile%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } if (!ntfs_mark_quotas_out_of_date(vol)) { ntfs_error(sb, "Failed to mark quotas out of date%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } if (!ntfs_stamp_usnjrnl(vol)) { ntfs_error(sb, "Failed to stamp transaction log " "($UsnJrnl)%s", es); NVolSetErrors(vol); - unlock_kernel(); return -EROFS; } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { @@ -543,11 +533,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) // TODO: Deal with *flags. - if (!parse_options(vol, opt)) { - unlock_kernel(); + if (!parse_options(vol, opt)) return -EINVAL; - } - unlock_kernel(); ntfs_debug("Done."); return 0; } @@ -2259,9 +2246,6 @@ static void ntfs_put_super(struct super_block *sb) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering."); - - lock_kernel(); - #ifdef NTFS_RW /* * Commit all inodes while they are still open in case some of them @@ -2389,12 +2373,39 @@ static void ntfs_put_super(struct super_block *sb) vol->mftmirr_ino = NULL; } /* - * We should have no dirty inodes left, due to - * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. + * If any dirty inodes are left, throw away all mft data page cache + * pages to allow a clean umount. This should never happen any more + * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. If it does + * happen anyway, we want to know... */ ntfs_commit_inode(vol->mft_ino); write_inode_now(vol->mft_ino, 1); + if (sb_has_dirty_inodes(sb)) { + const char *s1, *s2; + + mutex_lock(&vol->mft_ino->i_mutex); + truncate_inode_pages(vol->mft_ino->i_mapping, 0); + mutex_unlock(&vol->mft_ino->i_mutex); + write_inode_now(vol->mft_ino, 1); + if (sb_has_dirty_inodes(sb)) { + static const char *_s1 = "inodes"; + static const char *_s2 = ""; + s1 = _s1; + s2 = _s2; + } else { + static const char *_s1 = "mft pages"; + static const char *_s2 = "They have been thrown " + "away. "; + s1 = _s1; + s2 = _s2; + } + ntfs_error(sb, "Dirty %s found at umount time. %sYou should " + "run chkdsk. 
Please email " + "linux-ntfs-dev@lists.sourceforge.net and say " + "that you saw this message. Thank you.", s1, + s2); + } #endif /* NTFS_RW */ iput(vol->mft_ino); @@ -2433,8 +2444,7 @@ static void ntfs_put_super(struct super_block *sb) } sb->s_fs_info = NULL; kfree(vol); - - unlock_kernel(); + return; } /** diff --git a/trunk/fs/ocfs2/super.c b/trunk/fs/ocfs2/super.c index 201b40a441fe..5c6163f55039 100644 --- a/trunk/fs/ocfs2/super.c +++ b/trunk/fs/ocfs2/super.c @@ -42,7 +42,6 @@ #include #include #include -#include #define MLOG_MASK_PREFIX ML_SUPER #include @@ -127,6 +126,7 @@ static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, int sect_size); +static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); @@ -141,6 +141,7 @@ static const struct super_operations ocfs2_sops = { .clear_inode = ocfs2_clear_inode, .delete_inode = ocfs2_delete_inode, .sync_fs = ocfs2_sync_fs, + .write_super = ocfs2_write_super, .put_super = ocfs2_put_super, .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, @@ -364,12 +365,24 @@ static struct file_operations ocfs2_osb_debug_fops = { .llseek = generic_file_llseek, }; +/* + * write_super and sync_fs ripped right out of ext3. + */ +static void ocfs2_write_super(struct super_block *sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + static int ocfs2_sync_fs(struct super_block *sb, int wait) { int status; tid_t target; struct ocfs2_super *osb = OCFS2_SB(sb); + sb->s_dirt = 0; + if (ocfs2_is_hard_readonly(osb)) return -EROFS; @@ -582,8 +595,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct mount_options parsed_options; struct ocfs2_super *osb = OCFS2_SB(sb); - lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { ret = -EINVAL; goto out; @@ -687,7 +698,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ocfs2_set_journal_params(osb); } out: - unlock_kernel(); return ret; } @@ -1540,13 +1550,9 @@ static void ocfs2_put_super(struct super_block *sb) { mlog_entry("(0x%p)\n", sb); - lock_kernel(); - ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - unlock_kernel(); - mlog_exit_void(); } diff --git a/trunk/fs/omfs/file.c b/trunk/fs/omfs/file.c index d17e774eaf45..834b2331f6b3 100644 --- a/trunk/fs/omfs/file.c +++ b/trunk/fs/omfs/file.c @@ -11,6 +11,21 @@ #include #include "omfs.h" +static int omfs_sync_file(struct file *file, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + err |= omfs_sync_inode(inode); + return err ? 
-EIO : 0; +} + static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) { return (sbi->s_sys_blocksize - offset - @@ -329,7 +344,7 @@ struct file_operations omfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = omfs_sync_file, .splice_read = generic_file_splice_read, }; diff --git a/trunk/fs/open.c b/trunk/fs/open.c index 7200e23d9258..bdfbf03615a4 100644 --- a/trunk/fs/open.c +++ b/trunk/fs/open.c @@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) audit_inode(NULL, dentry); - err = mnt_want_write_file(file); + err = mnt_want_write(file->f_path.mnt); if (err) goto out_putf; mutex_lock(&inode->i_mutex); @@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!file) goto out; - error = mnt_want_write_file(file); + error = mnt_want_write(file->f_path.mnt); if (error) goto out_fput; dentry = file->f_path.dentry; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 753ca37002c8..f6db9618a888 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -92,28 +92,3 @@ struct pde_opener { struct list_head lh; }; void pde_users_dec(struct proc_dir_entry *pde); - -extern spinlock_t proc_subdir_lock; - -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); -void task_mem(struct seq_file *, struct mm_struct *); - -struct proc_dir_entry *de_get(struct proc_dir_entry *de); -void de_put(struct proc_dir_entry *de); - -extern struct vfsmount *proc_mnt; -int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); - -/* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/ subdirectories. - */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); diff --git a/trunk/fs/proc/proc_devtree.c b/trunk/fs/proc/proc_devtree.c index fc6c3025befd..de2bba5a3440 100644 --- a/trunk/fs/proc/proc_devtree.c +++ b/trunk/fs/proc/proc_devtree.c @@ -11,7 +11,6 @@ #include #include #include -#include "internal.h" #ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, diff --git a/trunk/fs/qnx4/Makefile b/trunk/fs/qnx4/Makefile index e4d408cc5473..502d7fe98bab 100644 --- a/trunk/fs/qnx4/Makefile +++ b/trunk/fs/qnx4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4.o -qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o +qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o diff --git a/trunk/fs/qnx4/bitmap.c b/trunk/fs/qnx4/bitmap.c index e1cd061a25f7..8425cf6e9624 100644 --- a/trunk/fs/qnx4/bitmap.c +++ b/trunk/fs/qnx4/bitmap.c @@ -13,9 +13,14 @@ * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 
*/ +#include +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" #if 0 int qnx4_new_block(struct super_block *sb) diff --git a/trunk/fs/qnx4/dir.c b/trunk/fs/qnx4/dir.c index 003c68f3238b..ea9ffefb48ad 100644 --- a/trunk/fs/qnx4/dir.c +++ b/trunk/fs/qnx4/dir.c @@ -11,9 +11,14 @@ * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. */ +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" + static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { @@ -79,7 +84,7 @@ const struct file_operations qnx4_dir_operations = { .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = simple_fsync, + .fsync = file_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/trunk/fs/qnx4/file.c b/trunk/fs/qnx4/file.c index 09b170ac936c..867f42b02035 100644 --- a/trunk/fs/qnx4/file.c +++ b/trunk/fs/qnx4/file.c @@ -12,7 +12,8 @@ * 27-06-1998 by Frank Denis : file overwriting. */ -#include "qnx4.h" +#include +#include /* * We have mostly NULL's here: the current defaults are ok for @@ -28,7 +29,7 @@ const struct file_operations qnx4_file_operations = #ifdef CONFIG_QNX4FS_RW .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = simple_fsync, + .fsync = qnx4_sync_file, #endif }; diff --git a/trunk/fs/qnx4/fsync.c b/trunk/fs/qnx4/fsync.c new file mode 100644 index 000000000000..aa3b19544bee --- /dev/null +++ b/trunk/fs/qnx4/fsync.c @@ -0,0 +1,169 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 24-03-1998 by Richard Frowijn : first release. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * The functions for qnx4 fs file synchronization. 
+ */ + +#ifdef CONFIG_QNX4FS_RW + +static int sync_block(struct inode *inode, unsigned short *block, int wait) +{ + struct buffer_head *bh; + unsigned short tmp; + + if (!*block) + return 0; + tmp = *block; + bh = sb_find_get_block(inode->i_sb, *block); + if (!bh) + return 0; + if (*block != tmp) { + brelse(bh); + return 1; + } + if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { + brelse(bh); + return -1; + } + if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { + brelse(bh); + return 0; + } + ll_rw_block(WRITE, 1, &bh); + atomic_dec(&bh->b_count); + return 0; +} + +#ifdef WTF +static int sync_iblock(struct inode *inode, unsigned short *iblock, + struct buffer_head **bh, int wait) +{ + int rc; + unsigned short tmp; + + *bh = NULL; + tmp = *iblock; + if (!tmp) + return 0; + rc = sync_block(inode, iblock, wait); + if (rc) + return rc; + *bh = sb_bread(inode->i_sb, tmp); + if (tmp != *iblock) { + brelse(*bh); + *bh = NULL; + return 1; + } + if (!*bh) + return -1; + return 0; +} +#endif + +static int sync_direct(struct inode *inode, int wait) +{ + int i; + int rc, err = 0; + + for (i = 0; i < 7; i++) { + rc = sync_block(inode, + (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + return err; +} + +#ifdef WTF +static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait) +{ + int i; + struct buffer_head *ind_bh; + int rc, err = 0; + + rc = sync_iblock(inode, iblock, &ind_bh, wait); + if (rc || !ind_bh) + return rc; + + for (i = 0; i < 512; i++) { + rc = sync_block(inode, + ((unsigned short *) ind_bh->b_data) + i, + wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + brelse(ind_bh); + return err; +} + +static int sync_dindirect(struct inode *inode, unsigned short *diblock, + int wait) +{ + int i; + struct buffer_head *dind_bh; + int rc, err = 0; + + rc = sync_iblock(inode, diblock, &dind_bh, wait); + if (rc || !dind_bh) + return rc; + + for (i = 0; i < 512; i++) { + rc = sync_indirect(inode, + ((unsigned short *) dind_bh->b_data) + i, + wait); + if (rc > 0) + break; + if (rc) + err = rc; + } + brelse(dind_bh); + return err; +} +#endif + +int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused) +{ + struct inode *inode = dentry->d_inode; + int wait, err = 0; + + (void) file; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + + lock_kernel(); + for (wait = 0; wait <= 1; wait++) { + err |= sync_direct(inode, wait); + } + err |= qnx4_sync_inode(inode); + unlock_kernel(); + return (err < 0) ? 
-EIO : 0; +} + +#endif diff --git a/trunk/fs/qnx4/inode.c b/trunk/fs/qnx4/inode.c index 681df5fcd161..fe1f0f31d11c 100644 --- a/trunk/fs/qnx4/inode.c +++ b/trunk/fs/qnx4/inode.c @@ -13,15 +13,19 @@ */ #include -#include +#include +#include +#include #include +#include +#include +#include #include #include #include #include -#include -#include -#include "qnx4.h" +#include +#include #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" @@ -30,6 +34,31 @@ static const struct super_operations qnx4_sops; #ifdef CONFIG_QNX4FS_RW +int qnx4_sync_inode(struct inode *inode) +{ + int err = 0; +# if 0 + struct buffer_head *bh; + + bh = qnx4_update_inode(inode); + if (bh && buffer_dirty(bh)) + { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + { + printk ("IO error syncing qnx4 inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -1; + } + brelse (bh); + } else if (!bh) { + err = -1; + } +# endif + + return err; +} + static void qnx4_delete_inode(struct inode *inode) { QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); @@ -41,7 +70,15 @@ static void qnx4_delete_inode(struct inode *inode) unlock_kernel(); } -static int qnx4_write_inode(struct inode *inode, int do_sync) +static void qnx4_write_super(struct super_block *sb) +{ + lock_kernel(); + QNX4DEBUG(("qnx4: write_super\n")); + sb->s_dirt = 0; + unlock_kernel(); +} + +static int qnx4_write_inode(struct inode *inode, int unused) { struct qnx4_inode_entry *raw_inode; int block, ino; @@ -78,16 +115,6 @@ static int qnx4_write_inode(struct inode *inode, int do_sync) raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); mark_buffer_dirty(bh); - if (do_sync) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk("qnx4: IO error syncing inode [%s:%08x]\n", - inode->i_sb->s_id, ino); - brelse(bh); - unlock_kernel(); - return -EIO; - } - } brelse(bh); unlock_kernel(); return 0; @@ -111,6 +138,7 @@ static const struct super_operations qnx4_sops = #ifdef CONFIG_QNX4FS_RW .write_inode = qnx4_write_inode, .delete_inode = qnx4_delete_inode, + .write_super = qnx4_write_super, #endif }; diff --git a/trunk/fs/qnx4/namei.c b/trunk/fs/qnx4/namei.c index 5972ed214937..775eed3a4085 100644 --- a/trunk/fs/qnx4/namei.c +++ b/trunk/fs/qnx4/namei.c @@ -12,9 +12,16 @@ * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include "qnx4.h" /* @@ -180,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); clear_nlink(inode); mark_inode_dirty(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -225,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); + mark_buffer_dirty(bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; diff --git a/trunk/fs/qnx4/qnx4.h b/trunk/fs/qnx4/qnx4.h deleted file mode 100644 index 9efc089454f6..000000000000 --- a/trunk/fs/qnx4/qnx4.h +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#define QNX4_DEBUG 0 - -#if QNX4_DEBUG -#define QNX4DEBUG(X) printk X -#else -#define QNX4DEBUG(X) (void) 0 -#endif - -struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ - unsigned int Version; /* may be useful */ - struct qnx4_inode_entry *BitMap; /* useful */ -}; - -struct qnx4_inode_info { - struct qnx4_inode_entry raw; - loff_t mmu_private; - struct inode vfs_inode; -}; - -extern struct inode *qnx4_iget(struct super_block *, unsigned long); -extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); -extern unsigned long qnx4_count_free_blocks(struct super_block *sb); -extern unsigned long qnx4_block_map(struct inode *inode, long iblock); - -extern struct buffer_head *qnx4_bread(struct inode *, int, int); - -extern const struct inode_operations qnx4_file_inode_operations; -extern const struct inode_operations qnx4_dir_inode_operations; -extern const struct file_operations qnx4_file_operations; -extern const struct file_operations qnx4_dir_operations; -extern int qnx4_is_free(struct super_block *sb, long block); -extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); -extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); -extern void qnx4_truncate(struct inode *inode); -extern void qnx4_free_inode(struct inode *inode); -extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); -extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); - -static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) -{ - return container_of(inode, struct qnx4_inode_info, vfs_inode); -} - -static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) -{ - return &qnx4_i(inode)->raw; -} diff --git a/trunk/fs/qnx4/truncate.c b/trunk/fs/qnx4/truncate.c index d94d9ee241fe..6437c1c3d1dd 100644 --- a/trunk/fs/qnx4/truncate.c +++ b/trunk/fs/qnx4/truncate.c @@ -10,8 +10,12 @@ * 30-06-1998 by Frank DENIS : ugly filler. 
*/ +#include +#include +#include +#include #include -#include "qnx4.h" +#include #ifdef CONFIG_QNX4FS_RW diff --git a/trunk/fs/quota/quota.c b/trunk/fs/quota/quota.c index 95c5b42384b2..b7f5a468f076 100644 --- a/trunk/fs/quota/quota.c +++ b/trunk/fs/quota/quota.c @@ -159,14 +159,10 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, return error; } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) +static void quota_sync_sb(struct super_block *sb, int type) { int cnt; - if (!sb->s_qcop->quota_sync) - return; - sb->s_qcop->quota_sync(sb, type); if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) @@ -195,13 +191,17 @@ void sync_quota_sb(struct super_block *sb, int type) } mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); } -#endif -static void sync_dquots(int type) +void sync_dquots(struct super_block *sb, int type) { - struct super_block *sb; int cnt; + if (sb) { + if (sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + return; + } + spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { @@ -222,8 +222,8 @@ static void sync_dquots(int type) sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) - sync_quota_sb(sb, type); + if (sb->s_root && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) @@ -301,10 +301,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->set_dqblk(sb, type, id, &idq); } case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); + sync_dquots(sb, type); return 0; case Q_XQUOTAON: diff --git a/trunk/fs/reiserfs/dir.c b/trunk/fs/reiserfs/dir.c index 6d2668fdc384..45ee3d357c70 100644 --- a/trunk/fs/reiserfs/dir.c +++ b/trunk/fs/reiserfs/dir.c @@ -44,11 +44,13 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { + int ret = 0; +#ifdef CONFIG_REISERFS_FS_XATTR struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - if (reiserfs_expose_privroot(dir->d_sb)) - return 0; - return (dir == dir->d_parent && privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); + ret = (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +#endif + return ret; } int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, diff --git a/trunk/fs/reiserfs/super.c b/trunk/fs/reiserfs/super.c index 2969773cfc22..3567fb9e3fb1 100644 --- a/trunk/fs/reiserfs/super.c +++ b/trunk/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include struct file_system_type reiserfs_fs_type; @@ -65,15 +64,18 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); static int reiserfs_sync_fs(struct super_block *s, int wait) { - struct reiserfs_transaction_handle th; - - reiserfs_write_lock(s); - if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) - reiserfs_flush_old_commits(s); - s->s_dirt = 0; /* Even if it's not true. - * We'll loop forever in sync_supers otherwise */ - reiserfs_write_unlock(s); + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. 
+ * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + } else { + s->s_dirt = 0; + } return 0; } @@ -466,11 +468,6 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; - lock_kernel(); - - if (s->s_dirt) - reiserfs_write_super(s); - /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { @@ -503,7 +500,7 @@ static void reiserfs_put_super(struct super_block *s) kfree(s->s_fs_info); s->s_fs_info = NULL; - unlock_kernel(); + return; } static struct kmem_cache *reiserfs_inode_cachep; @@ -901,7 +898,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"conv",.setmask = 1 << REISERFS_CONVERT}, {"attrs",.setmask = 1 << REISERFS_ATTRS}, {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, - {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, #ifdef CONFIG_REISERFS_FS_XATTR {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, @@ -1197,7 +1193,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif - lock_kernel(); rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options @@ -1320,12 +1315,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) out_ok: replace_mount_options(s, new_opts); - unlock_kernel(); return 0; out_err: kfree(new_opts); - unlock_kernel(); return err; } diff --git a/trunk/fs/reiserfs/xattr.c b/trunk/fs/reiserfs/xattr.c index f3d47d856848..8e7deb0e6964 100644 --- a/trunk/fs/reiserfs/xattr.c +++ b/trunk/fs/reiserfs/xattr.c @@ -981,8 +981,7 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - if (!reiserfs_expose_privroot(s)) - s->s_root->d_op = &xattr_lookup_poison_ops; + s->s_root->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else diff --git a/trunk/fs/smbfs/inode.c b/trunk/fs/smbfs/inode.c index 1402d2d54f52..fc27fbfc5397 100644 --- a/trunk/fs/smbfs/inode.c +++ b/trunk/fs/smbfs/inode.c @@ -474,8 +474,6 @@ smb_put_super(struct super_block *sb) { struct smb_sb_info *server = SMB_SB(sb); - lock_kernel(); - smb_lock_server(server); server->state = CONN_INVALID; smbiod_unregister_server(server); @@ -491,8 +489,6 @@ smb_put_super(struct super_block *sb) smb_unlock_server(server); put_pid(server->conn_pid); kfree(server); - - unlock_kernel(); } static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) diff --git a/trunk/fs/squashfs/super.c b/trunk/fs/squashfs/super.c index 3b52770f46ff..0adc624c956f 100644 --- a/trunk/fs/squashfs/super.c +++ b/trunk/fs/squashfs/super.c @@ -338,8 +338,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data) static void squashfs_put_super(struct super_block *sb) { - lock_kernel(); - if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); @@ -352,8 +350,6 @@ static void squashfs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; } - - unlock_kernel(); } diff --git a/trunk/fs/super.c b/trunk/fs/super.c index 83b47416d006..1943fdf655fa 100644 --- a/trunk/fs/super.c +++ b/trunk/fs/super.c @@ -28,6 +28,7 @@ #include #include #include +#include /* for fsync_super() */ #include #include #include @@ 
-37,6 +38,7 @@ #include #include #include +#include #include #include "internal.h" @@ -70,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -282,6 +285,38 @@ void unlock_super(struct super_block * sb) EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. Requires a second blkdev + * flush by the caller to complete the operation. + */ +void __fsync_super(struct super_block *sb) +{ + sync_inodes_sb(sb, 0); + vfs_dq_sync(sb); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + __fsync_super(sb); + return sync_blockdev(sb->s_bdev); +} +EXPORT_SYMBOL_GPL(fsync_super); + /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -303,13 +338,21 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); - sync_filesystem(sb); - get_fs_excl(); + fsync_super(sb); + lock_super(sb); sb->s_flags &= ~MS_ACTIVE; + /* + * wait for asynchronous fs operations to finish before going further + */ + async_synchronize_full_domain(&sb->s_async_list); + /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); + lock_kernel(); + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); if (sop->put_super) sop->put_super(sb); @@ -319,7 +362,9 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); + + unlock_kernel(); + unlock_super(sb); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ @@ -396,14 +441,16 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); -/** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. - * +static inline void write_super(struct super_block *sb) +{ + lock_super(sb); + if (sb->s_root && sb->s_dirt) + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); +} + +/* * Note: check the dirty flag before waiting, so we don't * hold up the sync while mounting a device. (The newly * mounted device won't need syncing.) 
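The write_super() helper above captures the periodic writeback protocol that the per-filesystem changes in this patch rely on: a filesystem sets s_dirt = 1 whenever it dirties on-disk metadata, and the periodic sync_supers() pass below calls ->write_super on every dirty superblock to flush and clear that flag. As a rough sketch only (not part of this patch; "examplefs" and its commit helper are invented for illustration), a filesystem participates like this:

static void examplefs_write_super(struct super_block *sb)
{
	lock_kernel();
	if (!(sb->s_flags & MS_RDONLY))
		examplefs_commit_super(sb);	/* hypothetical: rewrite the superblock */
	/* Clear s_dirt even when read-only, or sync_supers() rescans forever. */
	sb->s_dirt = 0;
	unlock_kernel();
}

After any metadata update the filesystem then only needs sb->s_dirt = 1; the next sync_supers() pass does the rest.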
@@ -415,15 +462,12 @@ void sync_supers(void) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_op->write_super && sb->s_dirt) { + if (sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt) - sb->s_op->write_super(sb); + write_super(sb); up_read(&sb->s_umount); - spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) goto restart; @@ -432,6 +476,60 @@ void sync_supers(void) spin_unlock(&sb_lock); } +/* + * Call the ->sync_fs super_op against all filesystems which are r/w and + * which implement it. + * + * This operation is careful to avoid the livelock which could easily happen + * if two or more filesystems are being continuously dirtied. s_need_sync_fs + * is used only here. We set it against all filesystems and then clear it as + * we sync them. So redirtied filesystems are skipped. + * + * But if process A is currently running sync_filesystems and then process B + * calls sync_filesystems as well, process B will set all the s_need_sync_fs + * flags again, which will cause process A to resync everything. Fix that with + * a local mutex. + * + * (Fabian) Avoid sync_fs with clean fs & wait mode 0 + */ +void sync_filesystems(int wait) +{ + struct super_block *sb; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); /* Could be down_interruptible */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_op->sync_fs) + continue; + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync_fs = 1; + } + +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_need_sync_fs) + continue; + sb->s_need_sync_fs = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. Was remounted r/o meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + async_synchronize_full_domain(&sb->s_async_list); + if (sb->s_root && (wait || sb->s_dirt)) + sb->s_op->sync_fs(sb, wait); + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + mutex_unlock(&mutex); +} + /** * get_super - get the superblock of a device * @bdev: device to get the superblock for @@ -517,6 +615,45 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) return err; } +/** + * mark_files_ro - mark all files read-only + * @sb: superblock in question + * + * All files are marked read-only. We don't care about pending + * delete files so this should be used in 'force' mode only. + */ + +static void mark_files_ro(struct super_block *sb) +{ + struct file *f; + +retry: + file_list_lock(); + list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + file_list_unlock(); + /* + * This can sleep, so we can't hold + * the file_list_lock() spinlock. + */ + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } + file_list_unlock(); +} + /** * do_remount_sb - asks filesystem to change mount options. 
* @sb: superblock in question @@ -538,31 +675,27 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); + fsync_super(sb); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { if (force) mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) { - unlock_kernel(); + else if (!fs_may_remount_ro(sb)) return -EBUSY; - } retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) { - unlock_kernel(); + if (retval < 0 && retval != -ENOSYS) return -EBUSY; - } } remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { + lock_super(sb); retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) { - unlock_kernel(); + unlock_super(sb); + if (retval) return retval; - } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) @@ -578,17 +711,18 @@ static void do_emergency_remount(struct work_struct *work) list_for_each_entry(sb, &super_blocks, s_list) { sb->s_count++; spin_unlock(&sb_lock); - down_write(&sb->s_umount); + down_read(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* * ->remount_fs needs lock_kernel(). * * What lock protects sb->s_flags?? */ + lock_kernel(); do_remount_sb(sb, MS_RDONLY, NULL, 1); + unlock_kernel(); } - up_write(&sb->s_umount); - put_super(sb); + drop_super(sb); spin_lock(&sb_lock); } spin_unlock(&sb_lock); diff --git a/trunk/fs/sync.c b/trunk/fs/sync.c index dd200025af85..7abc65fbf21d 100644 --- a/trunk/fs/sync.c +++ b/trunk/fs/sync.c @@ -13,123 +13,38 @@ #include #include #include -#include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. */ -static int __sync_filesystem(struct super_block *sb, int wait) +static void do_sync(unsigned long wait) { - /* Avoid doing twice syncing and cache pruning for quota sync */ + wakeup_pdflush(0); + sync_inodes(0); /* All mappings, inodes and their blockdevs */ + vfs_dq_sync(NULL); + sync_supers(); /* Write the superblocks */ + sync_filesystems(0); /* Start syncing the filesystems */ + sync_filesystems(wait); /* Waitingly sync the filesystems */ + sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ if (!wait) - writeout_quota_sb(sb, -1); - else - sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int sync_filesystem(struct super_block *sb) -{ - int ret; - - /* - * We need to be protected against the filesystem going from - * r/o to r/w or vice versa. - */ - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - /* - * No point in syncing out anything if the filesystem is read-only. 
- */ - if (sb->s_flags & MS_RDONLY) - return 0; - - ret = __sync_filesystem(sb, 0); - if (ret < 0) - return ret; - return __sync_filesystem(sb, 1); -} -EXPORT_SYMBOL_GPL(sync_filesystem); - -/* - * Sync all the data for all the filesystems (called by sys_sync() and - * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. - */ -static void sync_filesystems(int wait) -{ - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) - sb->s_need_sync = 1; - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root) - __sync_filesystem(sb, wait); - up_read(&sb->s_umount); - - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); + printk("Emergency Sync complete\n"); + if (unlikely(laptop_mode)) + laptop_sync_completion(); } SYSCALL_DEFINE0(sync) { - sync_filesystems(0); - sync_filesystems(1); - if (unlikely(laptop_mode)) - laptop_sync_completion(); + do_sync(1); return 0; } static void do_sync_work(struct work_struct *work) { - /* - * Sync twice to reduce the possibility we skipped some inodes / pages - * because they were temporarily locked - */ - sync_filesystems(0); - sync_filesystems(0); - printk("Emergency Sync complete\n"); + do_sync(0); kfree(work); } @@ -160,8 +75,10 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) /* sync the superblock to buffers */ sb = inode->i_sb; + lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); + unlock_super(sb); /* .. 
finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); diff --git a/trunk/fs/sysv/dir.c b/trunk/fs/sysv/dir.c index c7798079e644..56f655254bfe 100644 --- a/trunk/fs/sysv/dir.c +++ b/trunk/fs/sysv/dir.c @@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t); const struct file_operations sysv_dir_operations = { .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = simple_fsync, + .fsync = sysv_sync_file, }; static inline void dir_put_page(struct page *page) diff --git a/trunk/fs/sysv/file.c b/trunk/fs/sysv/file.c index 96340c01f4a7..589be21d884e 100644 --- a/trunk/fs/sysv/file.c +++ b/trunk/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = sysv_sync_file, .splice_read = generic_file_splice_read, }; @@ -34,3 +34,18 @@ const struct inode_operations sysv_file_inode_operations = { .truncate = sysv_truncate, .getattr = sysv_getattr, }; + +int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + + err |= sysv_sync_inode(inode); + return err ? -EIO : 0; +} diff --git a/trunk/fs/sysv/inode.c b/trunk/fs/sysv/inode.c index 479923456a54..da20b48d350f 100644 --- a/trunk/fs/sysv/inode.c +++ b/trunk/fs/sysv/inode.c @@ -31,13 +31,15 @@ #include #include "sysv.h" -static int sysv_sync_fs(struct super_block *sb, int wait) +/* This is only called on sync() and umount(), when s_dirt=1. */ +static void sysv_write_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); unsigned long time = get_seconds(), old_time; - lock_super(sb); lock_kernel(); + if (sb->s_flags & MS_RDONLY) + goto clean; /* * If we are going to write out the super block, @@ -51,30 +53,18 @@ static int sysv_sync_fs(struct super_block *sb, int wait) *sbi->s_sb_time = cpu_to_fs32(sbi, time); mark_buffer_dirty(sbi->s_bh2); } - +clean: + sb->s_dirt = 0; unlock_kernel(); - unlock_super(sb); - - return 0; -} - -static void sysv_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - sysv_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_super(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; if (!(*flags & MS_RDONLY)) sb->s_dirt = 1; - unlock_super(sb); return 0; } @@ -82,11 +72,6 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_kernel(); - - if (sb->s_dirt) - sysv_write_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { /* XXX ext2 also updates the state here */ mark_buffer_dirty(sbi->s_bh1); @@ -99,8 +84,6 @@ static void sysv_put_super(struct super_block *sb) brelse(sbi->s_bh2); kfree(sbi); - - unlock_kernel(); } static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -253,7 +236,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static struct buffer_head * sysv_update_inode(struct inode * inode) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -261,21 +244,19 @@ int sysv_write_inode(struct inode *inode, int wait) struct sysv_inode * raw_inode; struct sysv_inode_info * 
si; unsigned int ino, block; - int err = 0; ino = inode->i_ino; if (!ino || ino > sbi->s_ninodes) { printk("Bad inode number on dev %s: %d is out of range\n", inode->i_sb->s_id, ino); - return -EIO; + return NULL; } raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) { printk("unable to read i-node block\n"); - return -EIO; + return NULL; } - lock_kernel(); raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); @@ -291,23 +272,38 @@ int sysv_write_inode(struct inode *inode, int wait) for (block = 0; block < 10+1+1+1; block++) write3byte(sbi, (u8 *)&si->i_data[block], &raw_inode->i_data[3*block]); - unlock_kernel(); mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08x]\n", - sb->s_id, ino); - err = -EIO; - } - } + return bh; +} + +int sysv_write_inode(struct inode * inode, int wait) +{ + struct buffer_head *bh; + lock_kernel(); + bh = sysv_update_inode(inode); brelse(bh); + unlock_kernel(); return 0; } -int sysv_sync_inode(struct inode *inode) +int sysv_sync_inode(struct inode * inode) { - return sysv_write_inode(inode, 1); + int err = 0; + struct buffer_head *bh; + + bh = sysv_update_inode(inode); + if (bh && buffer_dirty(bh)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -1; + } + } + else if (!bh) + err = -1; + brelse (bh); + return err; } static void sysv_delete_inode(struct inode *inode) @@ -351,7 +347,6 @@ const struct super_operations sysv_sops = { .delete_inode = sysv_delete_inode, .put_super = sysv_put_super, .write_super = sysv_write_super, - .sync_fs = sysv_sync_fs, .remount_fs = sysv_remount, .statfs = sysv_statfs, }; diff --git a/trunk/fs/sysv/sysv.h b/trunk/fs/sysv/sysv.h index 53786eb5cf60..5784a318c883 100644 --- a/trunk/fs/sysv/sysv.h +++ b/trunk/fs/sysv/sysv.h @@ -144,6 +144,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, int); extern int sysv_sync_inode(struct inode *); +extern int sysv_sync_file(struct file *, struct dentry *, int); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int sysv_init_icache(void); diff --git a/trunk/fs/ubifs/super.c b/trunk/fs/ubifs/super.c index 3589eab02a2f..e9f7a754c4f7 100644 --- a/trunk/fs/ubifs/super.c +++ b/trunk/fs/ubifs/super.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "ubifs.h" /* @@ -448,6 +447,9 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; + if (sb->s_flags & MS_RDONLY) + return 0; + /* * VFS calls '->sync_fs()' before synchronizing all dirty inodes and * pages, so synchronize them first, then commit the journal. Strictly @@ -1685,9 +1687,6 @@ static void ubifs_put_super(struct super_block *sb) ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); - - lock_kernel(); - /* * The following asserts are only valid if there has not been a failure * of the media. 
For example, there will be dirty inodes if we failed @@ -1754,8 +1753,6 @@ static void ubifs_put_super(struct super_block *sb) ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); kfree(c); - - unlock_kernel(); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -1771,22 +1768,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - lock_kernel(); if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } err = ubifs_remount_rw(c); - if (err) { - unlock_kernel(); + if (err) return err; - } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } ubifs_remount_ro(c); @@ -1801,7 +1793,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) } ubifs_assert(c->lst.taken_empty_lebs > 0); - unlock_kernel(); return 0; } diff --git a/trunk/fs/udf/Makefile b/trunk/fs/udf/Makefile index eb880f66c23a..0d4503f7446d 100644 --- a/trunk/fs/udf/Makefile +++ b/trunk/fs/udf/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UDF_FS) += udf.o udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ - partition.o super.o truncate.o symlink.o \ + partition.o super.o truncate.o symlink.o fsync.o \ directory.o misc.o udftime.o unicode.o diff --git a/trunk/fs/udf/dir.c b/trunk/fs/udf/dir.c index 61d9a76a3a69..2efd4d5291b6 100644 --- a/trunk/fs/udf/dir.c +++ b/trunk/fs/udf/dir.c @@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = { .read = generic_read_dir, .readdir = udf_readdir, .ioctl = udf_ioctl, - .fsync = simple_fsync, + .fsync = udf_fsync_file, }; diff --git a/trunk/fs/udf/file.c b/trunk/fs/udf/file.c index 7464305382b5..eb91f3b70320 100644 --- a/trunk/fs/udf/file.c +++ b/trunk/fs/udf/file.c @@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = { .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = simple_fsync, + .fsync = udf_fsync_file, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/trunk/fs/udf/fsync.c b/trunk/fs/udf/fsync.c new file mode 100644 index 000000000000..b2c472b733b8 --- /dev/null +++ b/trunk/fs/udf/fsync.c @@ -0,0 +1,52 @@ +/* + * fsync.c + * + * PURPOSE + * Fsync handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 05/22/99 blf Created. + */ + +#include "udfdecl.h" + +#include + +static int udf_fsync_inode(struct inode *, int); + +/* + * File may be NULL when we are called. Perhaps we shouldn't + * even pass file to fsync ? + */ + +int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + + return udf_fsync_inode(inode, datasync); +} + +static int udf_fsync_inode(struct inode *inode, int datasync) +{ + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + + err |= udf_sync_inode(inode); + + return err ? 
-EIO : 0; +} diff --git a/trunk/fs/udf/super.c b/trunk/fs/udf/super.c index 6832135159b6..0ba44107d8f1 100644 --- a/trunk/fs/udf/super.c +++ b/trunk/fs/udf/super.c @@ -568,7 +568,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) if (!udf_parse_options(options, &uopt, true)) return -EINVAL; - lock_kernel(); sbi->s_flags = uopt.flags; sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; @@ -582,16 +581,13 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) *flags |= MS_RDONLY; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - } if (*flags & MS_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); - unlock_kernel(); return 0; } @@ -2066,9 +2062,6 @@ static void udf_put_super(struct super_block *sb) struct udf_sb_info *sbi; sbi = UDF_SB(sb); - - lock_kernel(); - if (sbi->s_vat_inode) iput(sbi->s_vat_inode); if (sbi->s_partitions) @@ -2084,8 +2077,6 @@ static void udf_put_super(struct super_block *sb) kfree(sbi->s_partmaps); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int udf_sync_fs(struct super_block *sb, int wait) diff --git a/trunk/fs/udf/udfdecl.h b/trunk/fs/udf/udfdecl.h index 8d46f4294ee7..cac51b77a5d1 100644 --- a/trunk/fs/udf/udfdecl.h +++ b/trunk/fs/udf/udfdecl.h @@ -223,6 +223,9 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, extern int udf_new_block(struct super_block *, struct inode *, uint16_t, uint32_t, int *); +/* fsync.c */ +extern int udf_fsync_file(struct file *, struct dentry *, int); + /* directory.c */ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, diff --git a/trunk/fs/ufs/dir.c b/trunk/fs/ufs/dir.c index 6f671f1ac271..6321b797061b 100644 --- a/trunk/fs/ufs/dir.c +++ b/trunk/fs/ufs/dir.c @@ -666,6 +666,6 @@ int ufs_empty_dir(struct inode * inode) const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = simple_fsync, + .fsync = ufs_sync_file, .llseek = generic_file_llseek, }; diff --git a/trunk/fs/ufs/file.c b/trunk/fs/ufs/file.c index 73655c61240a..2bd3a1615714 100644 --- a/trunk/fs/ufs/file.c +++ b/trunk/fs/ufs/file.c @@ -24,10 +24,31 @@ */ #include +#include /* for sync_mapping_buffers() */ #include "ufs_fs.h" #include "ufs.h" + +int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = ufs_sync_inode(inode); + if (ret == 0) + ret = err; + return ret; +} + + /* * We have mostly NULL's here: the current defaults are ok for * the ufs filesystem. 
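ufs_sync_file above is the same per-inode fsync pattern this patch reintroduces for omfs, sysv and udf as well: push the data buffers of the mapping first, skip the inode when it is clean, honour the datasync hint by writing the inode only when data-integrity state (I_DIRTY_DATASYNC) is outstanding, and report -EIO on failure. Condensed to its skeleton (a sketch only, with a hypothetical examplefs_sync_inode() standing in for each filesystem's on-disk inode writer):

int examplefs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err;

	/* Write the data buffers hanging off this mapping. */
	err = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return err;		/* inode itself is clean */
	/* For fdatasync(), a timestamp-only dirty inode can be skipped. */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return err;
	err |= examplefs_sync_inode(inode);	/* hypothetical inode writer */
	return err ? -EIO : 0;
}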
@@ -41,6 +62,6 @@ const struct file_operations ufs_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .open = generic_file_open, - .fsync = simple_fsync, + .fsync = ufs_sync_file, .splice_read = generic_file_splice_read, }; diff --git a/trunk/fs/ufs/super.c b/trunk/fs/ufs/super.c index 5faed7954d0a..60359291761f 100644 --- a/trunk/fs/ufs/super.c +++ b/trunk/fs/ufs/super.c @@ -263,7 +263,6 @@ void ufs_panic (struct super_block * sb, const char * function, struct ufs_super_block_first * usb1; va_list args; - lock_kernel(); uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); @@ -595,9 +594,6 @@ static void ufs_put_super_internal(struct super_block *sb) UFSD("ENTER\n"); - - lock_kernel(); - ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -625,9 +621,6 @@ static void ufs_put_super_internal(struct super_block *sb) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); - - unlock_kernel(); - UFSD("EXIT\n"); } @@ -1125,45 +1118,32 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; } -static int ufs_sync_fs(struct super_block *sb, int wait) +static void ufs_write_super(struct super_block *sb) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; unsigned flags; - lock_super(sb); lock_kernel(); - UFSD("ENTER\n"); - flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); - if ((flags & UFS_ST_MASK) == UFS_ST_SUN || - (flags & UFS_ST_MASK) == UFS_ST_SUNOS || - (flags & UFS_ST_MASK) == UFS_ST_SUNx86) - ufs_set_fs_state(sb, usb1, usb3, - UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ufs_put_cstotal(sb); + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN + || (flags & UFS_ST_MASK) == UFS_ST_SUNOS + || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + } sb->s_dirt = 0; - UFSD("EXIT\n"); unlock_kernel(); - unlock_super(sb); - - return 0; -} - -static void ufs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - ufs_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static void ufs_put_super(struct super_block *sb) @@ -1172,9 +1152,6 @@ static void ufs_put_super(struct super_block *sb) UFSD("ENTER\n"); - if (sb->s_dirt) - ufs_write_super(sb); - if (!(sb->s_flags & MS_RDONLY)) ufs_put_super_internal(sb); @@ -1194,9 +1171,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) struct ufs_super_block_third * usb3; unsigned new_mount_opt, ufstype; unsigned flags; - - lock_kernel(); - lock_super(sb); + uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1209,24 +1184,17 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options (data, &new_mount_opt)) { - unlock_super(sb); - unlock_kernel(); + if (!ufs_parse_options (data, &new_mount_opt)) return -EINVAL; - } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; } if 
((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_super(sb); - unlock_kernel(); return 0; } @@ -1251,8 +1219,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1261,22 +1227,16 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); - unlock_super(sb); - unlock_kernel(); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); - unlock_super(sb); - unlock_kernel(); return -EPERM; } sb->s_flags &= ~MS_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_super(sb); - unlock_kernel(); return 0; } @@ -1392,7 +1352,6 @@ static const struct super_operations ufs_super_ops = { .delete_inode = ufs_delete_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, - .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, diff --git a/trunk/fs/ufs/ufs.h b/trunk/fs/ufs/ufs.h index 644e77e13599..d0c4acd4f1f3 100644 --- a/trunk/fs/ufs/ufs.h +++ b/trunk/fs/ufs/ufs.h @@ -99,6 +99,7 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, extern const struct inode_operations ufs_file_inode_operations; extern const struct file_operations ufs_file_operations; extern const struct address_space_operations ufs_aops; +extern int ufs_sync_file(struct file *, struct dentry *, int); /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); diff --git a/trunk/fs/xattr.c b/trunk/fs/xattr.c index 1c3d0af59ddf..d51b8f9db921 100644 --- a/trunk/fs/xattr.c +++ b/trunk/fs/xattr.c @@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write(f->f_path.mnt); if (!error) { error = setxattr(dentry, name, value, size, flags); mnt_drop_write(f->f_path.mnt); @@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write(f->f_path.mnt); if (!error) { error = removexattr(dentry, name); mnt_drop_write(f->f_path.mnt); diff --git a/trunk/fs/xfs/linux-2.6/xfs_super.c b/trunk/fs/xfs/linux-2.6/xfs_super.c index 08d6bd9a3947..bb685269f832 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_super.c +++ b/trunk/fs/xfs/linux-2.6/xfs_super.c @@ -1104,6 +1104,15 @@ xfs_fs_put_super( kfree(mp); } +STATIC void +xfs_fs_write_super( + struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + xfs_sync_fsdata(XFS_M(sb), 0); + sb->s_dirt = 0; +} + STATIC int xfs_fs_sync_super( struct super_block *sb, @@ -1128,6 +1137,7 @@ xfs_fs_sync_super( error = xfs_quiesce_data(mp); else error = xfs_sync_fsdata(mp, 0); + sb->s_dirt = 0; if (unlikely(laptop_mode)) { int prev_sync_seq = mp->m_sync_seq; @@ -1433,6 +1443,7 @@ xfs_fs_fill_super( XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); + sb->s_dirt = 1; sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1522,6 +1533,7 @@ static struct super_operations 
xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, + .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, diff --git a/trunk/fs/xfs/xfs_trans.c b/trunk/fs/xfs/xfs_trans.c index bcc39d358ad3..8570b826fedd 100644 --- a/trunk/fs/xfs/xfs_trans.c +++ b/trunk/fs/xfs/xfs_trans.c @@ -628,6 +628,8 @@ xfs_trans_apply_sb_deltas( xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), offsetof(xfs_dsb_t, sb_frextents) + sizeof(sbp->sb_frextents) - 1); + + tp->t_mountp->m_super->s_dirt = 1; } /* diff --git a/trunk/include/linux/Kbuild b/trunk/include/linux/Kbuild index b3afd2219ad2..3f0eaa397ef5 100644 --- a/trunk/include/linux/Kbuild +++ b/trunk/include/linux/Kbuild @@ -135,7 +135,6 @@ header-y += posix_types.h header-y += ppdev.h header-y += prctl.h header-y += qnxtypes.h -header-y += qnx4_fs.h header-y += radeonfb.h header-y += raw.h header-y += resource.h @@ -309,6 +308,7 @@ unifdef-y += poll.h unifdef-y += ppp_defs.h unifdef-y += ppp-comp.h unifdef-y += ptrace.h +unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h unifdef-y += irqnr.h diff --git a/trunk/include/linux/cdev.h b/trunk/include/linux/cdev.h index f389e319a454..fb4591977b03 100644 --- a/trunk/include/linux/cdev.h +++ b/trunk/include/linux/cdev.h @@ -28,8 +28,6 @@ int cdev_add(struct cdev *, dev_t, unsigned); void cdev_del(struct cdev *); -int cdev_index(struct inode *inode); - void cd_forget(struct inode *); extern struct backing_dev_info directly_mappable_cdev_bdi; diff --git a/trunk/include/linux/cramfs_fs.h b/trunk/include/linux/cramfs_fs.h index 6fc2bed368b8..3be4e5a27d82 100644 --- a/trunk/include/linux/cramfs_fs.h +++ b/trunk/include/linux/cramfs_fs.h @@ -2,8 +2,9 @@ #define __CRAMFS_H #include <linux/types.h> -#include <linux/magic.h> +#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ +#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define CRAMFS_SIGNATURE "Compressed ROMFS" /* diff --git a/trunk/include/linux/dcache.h b/trunk/include/linux/dcache.h index 30b93b2a01a4..15156364d196 100644 --- a/trunk/include/linux/dcache.h +++ b/trunk/include/linux/dcache.h @@ -180,12 +180,10 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard.
*/ #define DCACHE_UNHASHED 0x0010 -#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched by inotify */ +#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ #define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ - extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -353,11 +351,6 @@ static inline int d_unhashed(struct dentry *dentry) return (dentry->d_flags & DCACHE_UNHASHED); } -static inline int d_unlinked(struct dentry *dentry) -{ - return d_unhashed(dentry) && !IS_ROOT(dentry); -} - static inline struct dentry *dget_parent(struct dentry *dentry) { struct dentry *ret; @@ -375,7 +368,7 @@ static inline int d_mountpoint(struct dentry *dentry) return dentry->d_mounted; } -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; diff --git a/trunk/include/linux/dnotify.h b/trunk/include/linux/dnotify.h index ecc06286226d..102a902b4396 100644 --- a/trunk/include/linux/dnotify.h +++ b/trunk/include/linux/dnotify.h @@ -10,7 +10,7 @@ struct dnotify_struct { struct dnotify_struct * dn_next; - __u32 dn_mask; + unsigned long dn_mask; int dn_fd; struct file * dn_filp; fl_owner_t dn_owner; @@ -21,18 +21,23 @@ struct dnotify_struct { #ifdef CONFIG_DNOTIFY -#define DNOTIFY_ALL_EVENTS (FS_DELETE | FS_DELETE_CHILD |\ - FS_MODIFY | FS_MODIFY_CHILD |\ - FS_ACCESS | FS_ACCESS_CHILD |\ - FS_ATTRIB | FS_ATTRIB_CHILD |\ - FS_CREATE | FS_DN_RENAME |\ - FS_MOVED_FROM | FS_MOVED_TO) - +extern void __inode_dir_notify(struct inode *, unsigned long); extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); +extern void dnotify_parent(struct dentry *, unsigned long); + +static inline void inode_dir_notify(struct inode *inode, unsigned long event) +{ + if (inode->i_dnotify_mask & (event)) + __inode_dir_notify(inode, event); +} #else +static inline void __inode_dir_notify(struct inode *inode, unsigned long event) +{ +} + static inline void dnotify_flush(struct file *filp, fl_owner_t id) { } @@ -42,6 +47,14 @@ static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) return -EINVAL; } +static inline void dnotify_parent(struct dentry *dentry, unsigned long event) +{ +} + +static inline void inode_dir_notify(struct inode *inode, unsigned long event) +{ +} + #endif /* CONFIG_DNOTIFY */ #endif /* __KERNEL __ */ diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index ede84fa7da5d..83d6b4397245 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -729,8 +729,8 @@ struct inode { struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; - blkcnt_t i_blocks; unsigned int i_blkbits; + blkcnt_t i_blocks; unsigned short i_bytes; umode_t i_mode; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ @@ -751,12 +751,13 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; }; + int i_cindex; __u32 i_generation; -#ifdef CONFIG_FSNOTIFY - __u32 i_fsnotify_mask; /* all events this inode cares about */ - struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ +#ifdef CONFIG_DNOTIFY + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ #endif #ifdef CONFIG_INOTIFY @@ 
-1320,7 +1321,7 @@ struct super_block { struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_need_sync; + int s_need_sync_fs; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; @@ -1371,6 +1372,11 @@ struct super_block { * generic_show_options() */ char *s_options; + + /* + * storage for asynchronous operations + */ + struct list_head s_async_list; }; extern struct timespec current_fs_time(struct super_block *sb); @@ -1794,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(struct path *); +extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); @@ -1941,6 +1947,8 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } @@ -1956,7 +1964,6 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) return 0; } #endif -extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; extern const struct file_operations bad_sock_fops; @@ -2075,8 +2082,12 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); extern void sync_supers(void); +extern void sync_filesystems(int wait); +extern void __fsync_super(struct super_block *sb); extern void emergency_sync(void); extern void emergency_remount(void); +extern int do_remount_sb(struct super_block *sb, int flags, + void *data, int force); #ifdef CONFIG_BLOCK extern sector_t bmap(struct inode *, sector_t); #endif @@ -2345,8 +2356,6 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -extern int simple_fsync(struct file *, struct dentry *, int); - #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *); diff --git a/trunk/include/linux/fsnotify.h b/trunk/include/linux/fsnotify.h index 936f9aa8bb97..00fbd5b245c9 100644 --- a/trunk/include/linux/fsnotify.h +++ b/trunk/include/linux/fsnotify.h @@ -13,7 +13,6 @@ #include <linux/dnotify.h> #include <linux/inotify.h> -#include <linux/fsnotify_backend.h> #include <linux/audit.h> /* @@ -23,44 +22,18 @@ static inline void fsnotify_d_instantiate(struct dentry *entry, struct inode *inode) { - __fsnotify_d_instantiate(entry, inode); - inotify_d_instantiate(entry, inode); } -/* Notify this dentry's parent about a child's events. */ -static inline void fsnotify_parent(struct dentry *dentry, __u32 mask) -{ - __fsnotify_parent(dentry, mask); - - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); -} - /* * fsnotify_d_move - entry has been moved * Called with dcache_lock and entry->d_lock held.
*/ static inline void fsnotify_d_move(struct dentry *entry) { - /* - * On move we need to update entry->d_flags to indicate if the new parent - * cares about events from this entry. - */ - __fsnotify_update_dcache_flags(entry); - inotify_d_move(entry); } -/* - * fsnotify_link_count - inode's link count changed - */ -static inline void fsnotify_link_count(struct inode *inode) -{ - inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); - - fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0); -} - /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ @@ -69,62 +42,42 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; - u32 in_cookie = inotify_get_cookie(); - u32 fs_cookie = fsnotify_get_cookie(); - __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); - __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); + u32 cookie = inotify_get_cookie(); if (old_dir == new_dir) - old_dir_mask |= FS_DN_RENAME; - - if (isdir) { - isdir = IN_ISDIR; - old_dir_mask |= FS_IN_ISDIR; - new_dir_mask |= FS_IN_ISDIR; + inode_dir_notify(old_dir, DN_RENAME); + else { + inode_dir_notify(old_dir, DN_DELETE); + inode_dir_notify(new_dir, DN_CREATE); } - inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name, + if (isdir) + isdir = IN_ISDIR; + inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name, source); - inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name, + inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name, source); - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); - if (target) { inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(target); - - /* this is really a link_count change not a removal */ - fsnotify_link_count(target); } if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); - fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); } audit_inode_child(new_name, moved, new_dir); } -/* - * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed - */ -static inline void fsnotify_inode_delete(struct inode *inode) -{ - __fsnotify_inode_delete(inode); -} - /* * fsnotify_nameremove - a filename was removed from a directory */ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) { - __u32 mask = FS_DELETE; - if (isdir) - mask |= FS_IN_ISDIR; - - fsnotify_parent(dentry, mask); + isdir = IN_ISDIR; + dnotify_parent(dentry, DN_DELETE); + inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name); } /* @@ -134,9 +87,14 @@ static inline void fsnotify_inoderemove(struct inode *inode) { inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(inode); +} - fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - __fsnotify_inode_delete(inode); +/* + * fsnotify_link_count - inode's link count changed + */ +static inline void fsnotify_link_count(struct inode *inode) +{ + inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); } /* @@ -144,11 +102,10 @@ static inline void fsnotify_inoderemove(struct inode *inode) */ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) { + inode_dir_notify(inode, DN_CREATE); 
inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - - fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -158,12 +115,11 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { + inode_dir_notify(dir, DN_CREATE); inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name, inode); fsnotify_link_count(inode); audit_inode_child(new_dentry->d_name.name, new_dentry, dir); - - fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0); } /* @@ -171,13 +127,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct */ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) { - __u32 mask = (FS_CREATE | FS_IN_ISDIR); - struct inode *d_inode = dentry->d_inode; - - inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); + inode_dir_notify(inode, DN_CREATE); + inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, + dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - - fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -186,15 +139,14 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) static inline void fsnotify_access(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_ACCESS; + u32 mask = IN_ACCESS; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + dnotify_parent(dentry, DN_ACCESS); + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -203,15 +155,14 @@ static inline void fsnotify_access(struct dentry *dentry) static inline void fsnotify_modify(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_MODIFY; + u32 mask = IN_MODIFY; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + dnotify_parent(dentry, DN_MODIFY); + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -220,15 +171,13 @@ static inline void fsnotify_modify(struct dentry *dentry) static inline void fsnotify_open(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_OPEN; + u32 mask = IN_OPEN; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -238,16 +187,15 @@ static inline void fsnotify_close(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; + const char *name = dentry->d_name.name; fmode_t mode = file->f_mode; - __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; + u32 mask = (mode & FMODE_WRITE) ? 
IN_CLOSE_WRITE : IN_CLOSE_NOWRITE; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); } /* @@ -256,15 +204,13 @@ static inline void fsnotify_close(struct file *file) static inline void fsnotify_xattr(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - __u32 mask = FS_ATTRIB; + u32 mask = IN_ATTRIB; if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; + mask |= IN_ISDIR; + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -274,37 +220,50 @@ static inline void fsnotify_xattr(struct dentry *dentry) static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { struct inode *inode = dentry->d_inode; - __u32 mask = 0; - - if (ia_valid & ATTR_UID) - mask |= FS_ATTRIB; - if (ia_valid & ATTR_GID) - mask |= FS_ATTRIB; - if (ia_valid & ATTR_SIZE) - mask |= FS_MODIFY; + int dn_mask = 0; + u32 in_mask = 0; + if (ia_valid & ATTR_UID) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } + if (ia_valid & ATTR_GID) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } + if (ia_valid & ATTR_SIZE) { + in_mask |= IN_MODIFY; + dn_mask |= DN_MODIFY; + } /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) - mask |= FS_ATTRIB; - else if (ia_valid & ATTR_ATIME) - mask |= FS_ACCESS; - else if (ia_valid & ATTR_MTIME) - mask |= FS_MODIFY; - - if (ia_valid & ATTR_MODE) - mask |= FS_ATTRIB; + { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } else if (ia_valid & ATTR_ATIME) { + in_mask |= IN_ACCESS; + dn_mask |= DN_ACCESS; + } else if (ia_valid & ATTR_MTIME) { + in_mask |= IN_MODIFY; + dn_mask |= DN_MODIFY; + } + if (ia_valid & ATTR_MODE) { + in_mask |= IN_ATTRIB; + dn_mask |= DN_ATTRIB; + } - if (mask) { + if (dn_mask) + dnotify_parent(dentry, dn_mask); + if (in_mask) { if (S_ISDIR(inode->i_mode)) - mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + in_mask |= IN_ISDIR; + inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL); + inotify_dentry_parent_queue_event(dentry, in_mask, 0, + dentry->d_name.name); } } -#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY) /* notify helpers */ +#ifdef CONFIG_INOTIFY /* inotify helpers */ /* * fsnotify_oldname_init - save off the old filename before we change it @@ -322,7 +281,7 @@ static inline void fsnotify_oldname_free(const char *old_name) kfree(old_name); } -#else /* CONFIG_INOTIFY || CONFIG_FSNOTIFY */ +#else /* CONFIG_INOTIFY */ static inline const char *fsnotify_oldname_init(const char *name) { diff --git a/trunk/include/linux/fsnotify_backend.h b/trunk/include/linux/fsnotify_backend.h deleted file mode 100644 index 44848aa830dc..000000000000 --- a/trunk/include/linux/fsnotify_backend.h +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Filesystem access notification for Linux - * - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - */ - -#ifndef __LINUX_FSNOTIFY_BACKEND_H -#define __LINUX_FSNOTIFY_BACKEND_H - -#ifdef __KERNEL__ - -#include <linux/idr.h> /* inotify uses this */ -#include <linux/fs.h> /* struct inode */ -#include <linux/list.h> -#include <linux/path.h> /* struct path */ -#include <linux/spinlock.h> -#include <linux/types.h> - 
-#include <asm/atomic.h> - -/* - * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily - * convert between them. dnotify only needs conversion at watch creation - * so no perf loss there. fanotify isn't defined yet, so it can use the - * wholes if it needs more events. - */ -#define FS_ACCESS 0x00000001 /* File was accessed */ -#define FS_MODIFY 0x00000002 /* File was modified */ -#define FS_ATTRIB 0x00000004 /* Metadata changed */ -#define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ -#define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ -#define FS_OPEN 0x00000020 /* File was opened */ -#define FS_MOVED_FROM 0x00000040 /* File was moved from X */ -#define FS_MOVED_TO 0x00000080 /* File was moved to Y */ -#define FS_CREATE 0x00000100 /* Subfile was created */ -#define FS_DELETE 0x00000200 /* Subfile was deleted */ -#define FS_DELETE_SELF 0x00000400 /* Self was deleted */ -#define FS_MOVE_SELF 0x00000800 /* Self was moved */ - -#define FS_UNMOUNT 0x00002000 /* inode on umount fs */ -#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ -#define FS_IN_IGNORED 0x00008000 /* last inotify event here */ - -#define FS_IN_ISDIR 0x40000000 /* event occurred against dir */ -#define FS_IN_ONESHOT 0x80000000 /* only send event once */ - -#define FS_DN_RENAME 0x10000000 /* file renamed */ -#define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ - -/* This inode cares about things that happen to its children. Always set for - * dnotify and inotify. */ -#define FS_EVENT_ON_CHILD 0x08000000 - -/* This is a list of all events that may get sent to a parernt based on fs event - * happening to inodes inside that directory */ -#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\ - FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\ - FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ - FS_DELETE) - -/* listeners that hard code group numbers near the top */ -#define DNOTIFY_GROUP_NUM UINT_MAX -#define INOTIFY_GROUP_NUM (DNOTIFY_GROUP_NUM-1) - -struct fsnotify_group; -struct fsnotify_event; -struct fsnotify_mark_entry; -struct fsnotify_event_private_data; - -/* - * Each group much define these ops. The fsnotify infrastructure will call - * these operations for each relevant group. - * - * should_send_event - given a group, inode, and mask this function determines - * if the group is interested in this event. - * handle_event - main call for a group to handle an fs event - * free_group_priv - called when a group refcnt hits 0 to clean up the private union - * freeing-mark - this means that a mark has been flagged to die when everything - * finishes using it. The function is supplied with what must be a - * valid group and inode to use to clean up. - */ -struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask); - int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event); - void (*free_group_priv)(struct fsnotify_group *group); - void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); - void (*free_event_priv)(struct fsnotify_event_private_data *priv); -}; - -/* - * A group is a "thing" that wants to receive notification about filesystem - * events. The mask holds the subset of event types this group cares about. - * refcnt on a group is up to the implementor and at any moment if it goes 0 - * everything will be cleaned up. - */ -struct fsnotify_group { - /* - * global list of all groups receiving events from fsnotify.
- * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex - * or fsnotify_grp_srcu depending on write vs read. - */ - struct list_head group_list; - - /* - * Defines all of the event types in which this group is interested. - * This mask is a bitwise OR of the FS_* events from above. Each time - * this mask changes for a group (if it changes) the correct functions - * must be called to update the global structures which indicate global - * interest in event types. - */ - __u32 mask; - - /* - * How the refcnt is used is up to each group. When the refcnt hits 0 - * fsnotify will clean up all of the resources associated with this group. - * As an example, the dnotify group will always have a refcnt=1 and that - * will never change. Inotify, on the other hand, has a group per - * inotify_init() and the refcnt will hit 0 only when that fd has been - * closed. - */ - atomic_t refcnt; /* things with interest in this group */ - unsigned int group_num; /* simply prevents accidental group collision */ - - const struct fsnotify_ops *ops; /* how this group handles things */ - - /* needed to send notification to userspace */ - struct mutex notification_mutex; /* protect the notification_list */ - struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ - wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ - unsigned int q_len; /* events on the queue */ - unsigned int max_events; /* maximum events allowed on the list */ - - /* stores all fastapth entries assoc with this group so they can be cleaned on unregister */ - spinlock_t mark_lock; /* protect mark_entries list */ - atomic_t num_marks; /* 1 for each mark entry and 1 for not being - * past the point of no return when freeing - * a group */ - struct list_head mark_entries; /* all inode mark entries for this group */ - - /* prevents double list_del of group_list. protected by global fsnotify_grp_mutex */ - bool on_group_list; - - /* groups can define private fields here or use the void *private */ - union { - void *private; -#ifdef CONFIG_INOTIFY_USER - struct inotify_group_private_data { - spinlock_t idr_lock; - struct idr idr; - u32 last_wd; - struct fasync_struct *fa; /* async notification */ - struct user_struct *user; - } inotify_data; -#endif - }; -}; - -/* - * A single event can be queued in multiple group->notification_lists. - * - * each group->notification_list will point to an event_holder which in turns points - * to the actual event that needs to be sent to userspace. - * - * Seemed cheaper to create a refcnt'd event and a small holder for every group - * than create a different event for every group - * - */ -struct fsnotify_event_holder { - struct fsnotify_event *event; - struct list_head event_list; -}; - -/* - * Inotify needs to tack data onto an event. This struct lets us later find the - * correct private data of the correct group. - */ -struct fsnotify_event_private_data { - struct fsnotify_group *group; - struct list_head event_list; -}; - -/* - * all of the information about the original object we want to now send to - * a group. If you want to carry more info from the accessing task to the - * listener this structure is where you need to be adding fields. - */ -struct fsnotify_event { - /* - * If we create an event we are also likely going to need a holder - * to link to a group. So embed one holder in the event. 
Means only - * one allocation for the common case where we only have one group - */ - struct fsnotify_event_holder holder; - spinlock_t lock; /* protection for the associated event_holder and private_list */ - /* to_tell may ONLY be dereferenced during handle_event(). */ - struct inode *to_tell; /* either the inode the event happened to or its parent */ - /* - * depending on the event type we should have either a path or inode - * We hold a reference on path, but NOT on inode. Since we have the ref on - * the path, it may be dereferenced at any point during this object's - * lifetime. That reference is dropped when this object's refcnt hits - * 0. If this event contains an inode instead of a path, the inode may - * ONLY be used during handle_event(). - */ - union { - struct path path; - struct inode *inode; - }; -/* when calling fsnotify tell it if the data is a path or inode */ -#define FSNOTIFY_EVENT_NONE 0 -#define FSNOTIFY_EVENT_PATH 1 -#define FSNOTIFY_EVENT_INODE 2 -#define FSNOTIFY_EVENT_FILE 3 - int data_type; /* which of the above union we have */ - atomic_t refcnt; /* how many groups still are using/need to send this event */ - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ - - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ - char *file_name; - size_t name_len; - - struct list_head private_data_list; /* groups can store private data here */ -}; - -/* - * a mark is simply an entry attached to an in core inode which allows an - * fsnotify listener to indicate they are either no longer interested in events - * of a type matching mask or only interested in those events. - * - * these are flushed when an inode is evicted from core and may be flushed - * when the inode is modified (as seen by fsnotify_access). Some fsnotify users - * (such as dnotify) will flush these when the open fd is closed and not at - * inode eviction or modification. - */ -struct fsnotify_mark_entry { - __u32 mask; /* mask this mark entry is for */ - /* we hold ref for each i_list and g_list. also one ref for each 'thing' - * in kernel that found and may be using this mark. */ - atomic_t refcnt; /* active things looking at this mark */ - struct inode *inode; /* inode this entry is associated with */ - struct fsnotify_group *group; /* group this mark entry is for */ - struct hlist_node i_list; /* list of mark_entries by inode->i_fsnotify_mark_entries */ - struct list_head g_list; /* list of mark_entries by group->i_fsnotify_mark_entries */ - spinlock_t lock; /* protect group, inode, and killme */ - struct list_head free_i_list; /* tmp list used when freeing this mark */ - struct list_head free_g_list; /* tmp list used when freeing this mark */ - void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */ -}; - -#ifdef CONFIG_FSNOTIFY - -/* called from the vfs helpers */ - -/* main fsnotify call to send events */ -extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, - const char *name, u32 cookie); -extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); -extern void __fsnotify_inode_delete(struct inode *inode); -extern u32 fsnotify_get_cookie(void); - -static inline int fsnotify_inode_watches_children(struct inode *inode) -{ - /* FS_EVENT_ON_CHILD is set if the inode may care */ - if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) - return 0; - /* this inode might care about child events, does it care about the - * specific set of events that can happen on a child? 
*/ - return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; -} - -/* - * Update the dentry with a flag indicating the interest of its parent to receive - * filesystem events when those events happens to this dentry->d_inode. - */ -static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) -{ - struct dentry *parent; - - assert_spin_locked(&dcache_lock); - assert_spin_locked(&dentry->d_lock); - - parent = dentry->d_parent; - if (fsnotify_inode_watches_children(parent->d_inode)) - dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; - else - dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; -} - -/* - * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. - */ -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) -{ - if (!inode) - return; - - assert_spin_locked(&dcache_lock); - - spin_lock(&dentry->d_lock); - __fsnotify_update_dcache_flags(dentry); - spin_unlock(&dentry->d_lock); -} - -/* called from fsnotify listeners, such as fanotify or dnotify */ - -/* must call when a group changes its ->mask */ -extern void fsnotify_recalc_global_mask(void); -/* get a reference to an existing or create a new group */ -extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, - __u32 mask, - const struct fsnotify_ops *ops); -/* run all marks associated with this group and update group->mask */ -extern void fsnotify_recalc_group_mask(struct fsnotify_group *group); -/* drop reference on a group from fsnotify_obtain_group */ -extern void fsnotify_put_group(struct fsnotify_group *group); - -/* take a reference to an event */ -extern void fsnotify_get_event(struct fsnotify_event *event); -extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event and unlink it */ -extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); - -/* attach the event to the group notification queue */ -extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv); -/* true if the group notification queue is empty */ -extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); -/* return, but do not dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); -/* return AND dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); - -/* functions used to manipulate the marks attached to inodes */ - -/* run all marks associated with an inode and update inode->i_fsnotify_mask */ -extern void fsnotify_recalc_inode_mask(struct inode *inode); -extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry)); -/* find (and take a reference) to a mark associated with group and inode */ -extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode); -/* attach the mark to both the group and the inode */ -extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode); -/* given a mark, flag it to be freed when all references are dropped */ -extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry); -/* run all the marks in a group, and flag them to be freed */ -extern void 
fsnotify_clear_marks_by_group(struct fsnotify_group *group); -extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); -extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); -extern void fsnotify_unmount_inodes(struct list_head *list); - -/* put here because inotify does some weird stuff when destroying watches */ -extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, const char *name, - u32 cookie); - -#else - -static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, - const char *name, u32 cookie) -{} - -static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) -{} - -static inline void __fsnotify_inode_delete(struct inode *inode) -{} - -static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) -{} - -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) -{} - -static inline u32 fsnotify_get_cookie(void) -{ - return 0; -} - -static inline void fsnotify_unmount_inodes(struct list_head *list) -{} - -#endif /* CONFIG_FSNOTIFY */ - -#endif /* __KERNEL __ */ - -#endif /* __LINUX_FSNOTIFY_BACKEND_H */ diff --git a/trunk/include/linux/magic.h b/trunk/include/linux/magic.h index 1923327b9869..927138cf3050 100644 --- a/trunk/include/linux/magic.h +++ b/trunk/include/linux/magic.h @@ -6,8 +6,6 @@ #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 #define CODA_SUPER_MAGIC 0x73757245 -#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ -#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define DEBUGFS_MAGIC 0x64626720 #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 diff --git a/trunk/include/linux/mount.h b/trunk/include/linux/mount.h index 5d5275364867..51f55f903aff 100644 --- a/trunk/include/linux/mount.h +++ b/trunk/include/linux/mount.h @@ -30,7 +30,7 @@ struct mnt_namespace; #define MNT_STRICTATIME 0x80 #define MNT_SHRINKABLE 0x100 -#define MNT_WRITE_HOLD 0x200 +#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ @@ -65,22 +65,13 @@ struct vfsmount { int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; -#ifdef CONFIG_SMP - int *mnt_writers; -#else - int mnt_writers; -#endif + /* + * This value is not stable unless all of the mnt_writers[] spinlocks + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; }; -static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - return mnt->mnt_writers; -#else - return &mnt->mnt_writers; -#endif -} - static inline struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) @@ -88,11 +79,7 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) return mnt; } -struct file; /* forward dec */ - extern int mnt_want_write(struct vfsmount *mnt); -extern int mnt_want_write_file(struct file *file); -extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput_no_expire(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); diff --git a/trunk/include/linux/namei.h b/trunk/include/linux/namei.h index d870ae2faedc..518098fe63af 100644 --- a/trunk/include/linux/namei.h +++ b/trunk/include/linux/namei.h @@ -18,7 +18,6 @@ enum { MAX_NESTED_LINKS = 8 }; struct nameidata { struct path path; struct qstr 
last; - struct path root; unsigned int flags; int last_type; unsigned depth; @@ -78,8 +77,8 @@ extern void release_open_intent(struct nameidata *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_noperm(const char *, struct dentry *); -extern int follow_down(struct path *); -extern int follow_up(struct path *); +extern int follow_down(struct vfsmount **, struct dentry **); +extern int follow_up(struct vfsmount **, struct dentry **); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); diff --git a/trunk/include/linux/nfsd/export.h b/trunk/include/linux/nfsd/export.h index a6d9ef2bb34a..bcd0201589f8 100644 --- a/trunk/include/linux/nfsd/export.h +++ b/trunk/include/linux/nfsd/export.h @@ -125,9 +125,11 @@ void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + struct vfsmount *, + struct dentry *); struct svc_export * rqst_exp_parent(struct svc_rqst *, - struct path *); + struct vfsmount *mnt, + struct dentry *dentry); int exp_rootfh(struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index e6e77d31c418..fbfa3d44d33d 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -93,9 +93,20 @@ struct vmcore { #ifdef CONFIG_PROC_FS +extern spinlock_t proc_subdir_lock; + extern void proc_root_init(void); void proc_flush_task(struct task_struct *task); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +int task_statm(struct mm_struct *, int *, int *, int *, int *); +void task_mem(struct seq_file *, struct mm_struct *); +void clear_refs_smap(struct mm_struct *mm); + +struct proc_dir_entry *de_get(struct proc_dir_entry *de); +void de_put(struct proc_dir_entry *de); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); @@ -105,7 +116,20 @@ struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, void *data); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); +extern struct vfsmount *proc_mnt; struct pid_namespace; +extern int proc_fill_super(struct super_block *); +extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/<pid> subdirectories.
+ */ +extern int proc_readdir(struct file *, void *, filldir_t); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); diff --git a/trunk/include/linux/qnx4_fs.h b/trunk/include/linux/qnx4_fs.h index 8b9aee1a9ce3..787d19ea9f46 100644 --- a/trunk/include/linux/qnx4_fs.h +++ b/trunk/include/linux/qnx4_fs.h @@ -85,4 +85,65 @@ struct qnx4_super_block { struct qnx4_inode_entry AltBoot; }; +#ifdef __KERNEL__ + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_file_inode_operations; +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_file_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); +extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); +extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); +extern void qnx4_truncate(struct inode *inode); +extern void qnx4_free_inode(struct inode *inode); +extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); +extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); +extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int); +extern int qnx4_sync_inode(struct inode *inode); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} + +#endif /* __KERNEL__ */ + #endif diff --git a/trunk/include/linux/quotaops.h b/trunk/include/linux/quotaops.h index 7bc457593684..36353d95c8db 100644 --- a/trunk/include/linux/quotaops.h +++ b/trunk/include/linux/quotaops.h @@ -20,12 +20,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. 
*/ -void sync_quota_sb(struct super_block *sb, int type); -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); -} +void sync_dquots(struct super_block *sb, int type); int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); @@ -258,7 +253,12 @@ static inline void vfs_dq_free_inode(struct inode *inode) inode->i_sb->dq_op->free_inode(inode, 1); } -/* Cannot be called inside a transaction */ +/* The following two functions cannot be called inside a transaction */ +static inline void vfs_dq_sync(struct super_block *sb) +{ + sync_dquots(sb, -1); +} + static inline int vfs_dq_off(struct super_block *sb, int remount) { int ret = -ENOSYS; @@ -334,11 +334,7 @@ static inline void vfs_dq_free_inode(struct inode *inode) { } -static inline void sync_quota_sb(struct super_block *sb, int type) -{ -} - -static inline void writeout_quota_sb(struct super_block *sb, int type) +static inline void vfs_dq_sync(struct super_block *sb) { } diff --git a/trunk/include/linux/reiserfs_fs_sb.h b/trunk/include/linux/reiserfs_fs_sb.h index dab68bbed675..6473650c28f1 100644 --- a/trunk/include/linux/reiserfs_fs_sb.h +++ b/trunk/include/linux/reiserfs_fs_sb.h @@ -453,7 +453,6 @@ enum reiserfs_mount_options { REISERFS_ATTRS, REISERFS_XATTRS_USER, REISERFS_POSIXACL, - REISERFS_EXPOSE_PRIVROOT, REISERFS_BARRIER_NONE, REISERFS_BARRIER_FLUSH, @@ -491,7 +490,6 @@ enum reiserfs_mount_options { #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) -#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) #define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) #define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) diff --git a/trunk/include/linux/writeback.h b/trunk/include/linux/writeback.h index 3224820c8514..93445477f86a 100644 --- a/trunk/include/linux/writeback.h +++ b/trunk/include/linux/writeback.h @@ -79,6 +79,7 @@ struct writeback_control { void writeback_inodes(struct writeback_control *wbc); int inode_wait(void *); void sync_inodes_sb(struct super_block *, int wait); +void sync_inodes(int wait); /* writeback.h requires fs.h; it, too, is not included from here. 
*/ static inline void wait_on_inode(struct inode *inode) diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index c649657e2259..9b68fee8d79e 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -302,8 +302,7 @@ config AUDITSYSCALL config AUDIT_TREE def_bool y - depends on AUDITSYSCALL - select INOTIFY + depends on AUDITSYSCALL && INOTIFY menu "RCU Subsystem" diff --git a/trunk/kernel/audit_tree.c b/trunk/kernel/audit_tree.c index 1f6396d76687..6e7351739a82 100644 --- a/trunk/kernel/audit_tree.c +++ b/trunk/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + root_mnt = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!root_mnt) goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + mnt = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!mnt) { err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new) err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(&path); + tagged = collect_mounts(path.mnt, path.dentry); path_put(&path); if (!tagged) return -ENOMEM; diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index 3fb789f6df94..a7267bfd3765 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -46,7 +46,6 @@ #include #include #include -#include #include @@ -901,7 +900,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -929,7 +927,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - unlock_kernel(); return ret; } diff --git a/trunk/virt/kvm/kvm_main.c b/trunk/virt/kvm/kvm_main.c index 764554350ed8..e21194566b71 100644 --- a/trunk/virt/kvm/kvm_main.c +++ b/trunk/virt/kvm/kvm_main.c @@ -2604,6 +2604,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, r = -ENOMEM; goto out_free_0; } + cpumask_clear(cpus_hardware_enabled); r = kvm_arch_hardware_setup(); if (r < 0)
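The ufs and xfs hunks above move superblock writeback onto the ->write_super / sb->s_dirt contract: code that changes on-disk super data sets s_dirt (as xfs_trans_apply_sb_deltas now does), and the VFS, via sync_supers(), later calls ->write_super on each dirty superblock. A minimal sketch of that contract for a hypothetical foofs; foofs_commit_super() is an assumed helper, not a real kernel symbol.

#include <linux/fs.h>
#include <linux/smp_lock.h>	/* lock_kernel(); these paths still ran under the BKL */

static void foofs_commit_super(struct super_block *sb);	/* assumed helper */

static void foofs_write_super(struct super_block *sb)
{
	lock_kernel();
	if (!(sb->s_flags & MS_RDONLY))
		foofs_commit_super(sb);	/* write the superblock buffer back */
	sb->s_dirt = 0;			/* superblock is clean again */
	unlock_kernel();
}

/* Metadata writers request a flush simply by marking the sb dirty: */
static void foofs_dirty_super(struct super_block *sb)
{
	sb->s_dirt = 1;	/* sync_supers() will call ->write_super before long */
}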
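The fs/xattr.c hunks swap mnt_want_write_file(f) back to mnt_want_write(f->f_path.mnt), but the calling pattern is the same bracket around any modification through an open file. A sketch of that bracket; do_modification() is a placeholder, not a real kernel function.

#include <linux/fs.h>
#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */

static int do_modification(struct dentry *dentry)
{
	return 0;	/* placeholder for the actual change */
}

static int modify_through_file(struct file *f)
{
	int error;

	error = mnt_want_write(f->f_path.mnt);	/* fails with -EROFS on r/o mounts */
	if (error)
		return error;
	error = do_modification(f->f_path.dentry);
	mnt_drop_write(f->f_path.mnt);		/* always balance the write count */
	return error;
}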
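The dnotify.h and fsnotify.h hunks restore the direct dnotify plumbing (inode_dir_notify(), dnotify_parent()). The userspace interface those hooks feed is unchanged and reachable through fcntl(); a small self-contained example, with error handling trimmed for brevity:

#define _GNU_SOURCE		/* for F_NOTIFY, F_SETSIG and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t notified;

static void on_dnotify(int sig)
{
	notified = 1;
}

int main(void)
{
	int fd = open(".", O_RDONLY);	/* dnotify watches directory fds */

	signal(SIGRTMIN, on_dnotify);
	fcntl(fd, F_SETSIG, SIGRTMIN);	/* deliver a real-time signal, not SIGIO */
	fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_RENAME | DN_MULTISHOT);

	pause();			/* returns once the directory changes */
	if (notified)
		printf("directory modified\n");
	close(fd);
	return 0;
}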
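Likewise, the fsnotify_* inline helpers now queue events straight to inotify via inotify_inode_queue_event() and inotify_dentry_parent_queue_event(). The userspace end of that queue is the familiar inotify API; a minimal reader (error handling trimmed):

#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	char *p;
	int fd = inotify_init();

	inotify_add_watch(fd, ".", IN_CREATE | IN_DELETE |
				   IN_MOVED_FROM | IN_MOVED_TO);

	len = read(fd, buf, sizeof(buf));	/* blocks until events are queued */
	for (p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("mask=0x%x cookie=%u name=%s\n",
		       ev->mask, ev->cookie, ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}

The cookie field is what pairs IN_MOVED_FROM with IN_MOVED_TO; it is the same value the kernel obtains from inotify_get_cookie() in the fsnotify_move() hunk above.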
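For readers tracing the deleted fsnotify_backend.h: a group registers an fsnotify_ops and receives events matching its mask. The sketch below is purely illustrative, built only from the declarations quoted in the removed header (an API this very patch deletes); the group number 42 and all example_* names are arbitrary.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fsnotify_backend.h>	/* the header removed above */

static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *inode, __u32 mask)
{
	return true;	/* take everything already filtered by group->mask */
}

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_event *event)
{
	pr_info("fsnotify event mask=0x%x\n", event->mask);
	return 0;
}

static const struct fsnotify_ops example_ops = {
	.should_send_event	= example_should_send_event,
	.handle_event		= example_handle_event,
	/* free_group_priv, freeing_mark, free_event_priv left NULL */
};

static struct fsnotify_group *example_group;

static int __init example_init(void)
{
	example_group = fsnotify_obtain_group(42, FS_CREATE | FS_DELETE,
					      &example_ops);
	return IS_ERR(example_group) ? PTR_ERR(example_group) : 0;
}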
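Finally, the shape of the API change running through lookup_mnt(), follow_down()/follow_up(), collect_mounts() and the nfsd export helpers: a single struct path argument reverts to an explicit (vfsmount, dentry) pair. The two conventions carry the same data, as this self-contained illustration with stub types shows; frob_path()/frob_pair() are placeholders.

struct vfsmount;
struct dentry;

struct path {
	struct vfsmount *mnt;
	struct dentry *dentry;
};

static int frob_pair(struct vfsmount *mnt, struct dentry *dentry)
{
	return (mnt && dentry) ? 0 : -1;	/* stand-in body */
}

/* aggregate style this patch moves away from; callers translate
 * mechanically, exactly as the kernel/audit_tree.c hunks do with
 * collect_mounts(path.mnt, path.dentry): */
static int frob_path(struct path *path)
{
	return frob_pair(path->mnt, path->dentry);
}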