From 79453cfb209e43a0fcddf93170d4ad4674647fda Mon Sep 17 00:00:00 2001
From: Ian Munsie
Date: Wed, 2 Feb 2011 17:27:24 +0000
Subject: [PATCH]

--- yaml ---
r: 251819
b: refs/heads/master
c: 02424d8966d803e33cbe51469be56b5d177b4a37
h: refs/heads/master
i:
  251817: e2c89b6385ad0b7dc8ebda193cadeb8c53600d29
  251815: 1b6dcb9e835f09026789de09991bf44dc5b1fc4a
v: v3
---
 [refs] | 2 +-
 .../ABI/{removed => obsolete}/o2cb | 9 +-
 .../ABI/testing/sysfs-kernel-mm-cleancache | 11 -
 .../feature-removal-schedule.txt | 10 +
 trunk/Documentation/filesystems/ext4.txt | 4 +
 trunk/Documentation/filesystems/ocfs2.txt | 8 +-
 trunk/Documentation/filesystems/xfs.txt | 6 -
 trunk/Documentation/vm/cleancache.txt | 278 ----
 trunk/MAINTAINERS | 13 +-
 trunk/arch/powerpc/Kconfig | 1 +
 trunk/arch/powerpc/include/asm/ftrace.h | 14 +
 trunk/arch/powerpc/include/asm/syscall.h | 5 +
 trunk/arch/powerpc/include/asm/thread_info.h | 7 +-
 trunk/arch/powerpc/kernel/Makefile | 1 +
 trunk/arch/powerpc/kernel/ftrace.c | 8 +
 trunk/arch/powerpc/kernel/ptrace.c | 10 +
 trunk/arch/x86/include/asm/xen/hypercall.h | 7 -
 trunk/drivers/video/mb862xx/mb862xx-i2c.c | 1 -
 trunk/drivers/xen/Makefile | 1 -
 trunk/drivers/xen/tmem.c | 264 ---
 trunk/fs/9p/vfs_inode.c | 4 -
 trunk/fs/Kconfig | 31 +-
 trunk/fs/affs/namei.c | 5 -
 trunk/fs/afs/dir.c | 5 -
 trunk/fs/autofs4/root.c | 2 -
 trunk/fs/bfs/dir.c | 3 -
 trunk/fs/btrfs/extent_io.c | 9 -
 trunk/fs/btrfs/super.c | 2 -
 trunk/fs/buffer.c | 64 +-
 trunk/fs/coda/dir.c | 5 -
 trunk/fs/configfs/dir.c | 2 -
 trunk/fs/ecryptfs/inode.c | 5 -
 trunk/fs/ext3/super.c | 2 -
 trunk/fs/ext4/Makefile | 3 +-
 trunk/fs/ext4/balloc.c | 146 +-
 trunk/fs/ext4/ext4.h | 127 +-
 trunk/fs/ext4/ext4_jbd2.c | 14 +
 trunk/fs/ext4/ext4_jbd2.h | 5 +
 trunk/fs/ext4/extents.c | 1416 +++++++----------
 trunk/fs/ext4/file.c | 1 +
 trunk/fs/ext4/fsync.c | 25 +-
 trunk/fs/ext4/inode.c | 114 +-
 trunk/fs/ext4/mballoc.c | 459 +++---
 trunk/fs/ext4/mballoc.h | 6 +
 trunk/fs/ext4/migrate.c | 2 +-
 trunk/fs/ext4/mmp.c | 351 ----
 trunk/fs/ext4/move_extent.c | 3 +-
 trunk/fs/ext4/namei.c | 82 +-
 trunk/fs/ext4/page-io.c | 39 +-
 trunk/fs/ext4/super.c | 206 +--
 trunk/fs/ext4/xattr.c | 4 +-
 trunk/fs/fat/namei_msdos.c | 5 -
 trunk/fs/fat/namei_vfat.c | 5 -
 trunk/fs/fuse/dir.c | 6 -
 trunk/fs/hfs/dir.c | 6 -
 trunk/fs/hfsplus/dir.c | 8 +-
 trunk/fs/hostfs/hostfs_kern.c | 5 -
 trunk/fs/hpfs/namei.c | 9 +-
 trunk/fs/hugetlbfs/inode.c | 3 +-
 trunk/fs/jbd2/commit.c | 22 +-
 trunk/fs/jbd2/journal.c | 58 +-
 trunk/fs/jbd2/transaction.c | 22 +-
 trunk/fs/jffs2/dir.c | 5 -
 trunk/fs/jfs/namei.c | 5 -
 trunk/fs/logfs/dir.c | 5 -
 trunk/fs/minix/namei.c | 5 -
 trunk/fs/mpage.c | 7 -
 trunk/fs/namei.c | 380 +++--
 trunk/fs/namespace.c | 2 +-
 trunk/fs/ncpfs/dir.c | 5 -
 trunk/fs/nilfs2/namei.c | 5 -
 trunk/fs/ocfs2/Makefile | 1 -
 trunk/fs/ocfs2/alloc.c | 166 --
 trunk/fs/ocfs2/alloc.h | 1 -
 trunk/fs/ocfs2/cluster/sys.c | 9 +
 trunk/fs/ocfs2/dlm/dlmcommon.h | 14 -
 trunk/fs/ocfs2/dlm/dlmdebug.c | 6 -
 trunk/fs/ocfs2/dlm/dlmdomain.c | 94 +-
 trunk/fs/ocfs2/dlm/dlmmaster.c | 255 +--
 trunk/fs/ocfs2/dlm/dlmrecovery.c | 1 -
 trunk/fs/ocfs2/dlmfs/dlmfs.c | 2 +-
 trunk/fs/ocfs2/file.c | 1 -
 trunk/fs/ocfs2/ioctl.c | 492 +-----
 trunk/fs/ocfs2/move_extents.c | 1153 --------------
 trunk/fs/ocfs2/move_extents.h | 22 -
 trunk/fs/ocfs2/ocfs2_ioctl.h | 68 -
 trunk/fs/ocfs2/ocfs2_trace.h | 25 -
 trunk/fs/ocfs2/refcounttree.c | 58 +-
 trunk/fs/ocfs2/refcounttree.h | 11 -
 trunk/fs/ocfs2/super.c | 4 +-
 trunk/fs/omfs/dir.c | 11 +-
 trunk/fs/proc/Makefile | 1 -
 trunk/fs/proc/base.c | 20 +-
 trunk/fs/proc/inode.c | 7 -
 trunk/fs/proc/internal.h | 18 -
 trunk/fs/proc/namespaces.c | 198 ---
 trunk/fs/proc/task_mmu.c | 2 +-
 trunk/fs/reiserfs/namei.c | 5 -
 trunk/fs/reiserfs/xattr.c | 1 +
 trunk/fs/super.c | 3 -
 trunk/fs/sysv/namei.c | 5 -
 trunk/fs/ubifs/dir.c | 5 -
 trunk/fs/udf/namei.c | 5 -
 trunk/fs/ufs/namei.c | 5 -
 trunk/fs/xfs/linux-2.6/xfs_discard.c | 29 -
 trunk/fs/xfs/linux-2.6/xfs_discard.h | 2 -
 trunk/fs/xfs/linux-2.6/xfs_super.c | 18 +-
 trunk/fs/xfs/xfs_ag.h | 3 -
 trunk/fs/xfs/xfs_alloc.c | 35 +-
 trunk/fs/xfs/xfs_alloc.h | 5 +-
 trunk/fs/xfs/xfs_alloc_btree.c | 3 +-
 trunk/fs/xfs/xfs_bmap.c | 549 ++++---
 trunk/fs/xfs/xfs_bmap.h | 2 +
 trunk/fs/xfs/xfs_inode.c | 15 +-
 trunk/fs/xfs/xfs_inode.h | 1 +
 trunk/fs/xfs/xfs_log_cil.c | 13 +-
 trunk/fs/xfs/xfs_mount.h | 1 -
 trunk/fs/xfs/xfs_trans.c | 2 +-
 trunk/include/linux/buffer_head.h | 16 -
 trunk/include/linux/cleancache.h | 122 --
 trunk/include/linux/fs.h | 5 -
 trunk/include/linux/hugetlb.h | 7 +-
 trunk/include/linux/hugetlb_inline.h | 2 +-
 trunk/include/linux/if_link.h | 1 -
 trunk/include/linux/jbd2.h | 8 +-
 trunk/include/linux/mm.h | 6 +-
 trunk/include/linux/mm_types.h | 4 +-
 trunk/include/linux/proc_fs.h | 21 -
 trunk/include/linux/syscalls.h | 1 -
 trunk/include/net/net_namespace.h | 1 -
 trunk/include/xen/interface/xen.h | 22 -
 trunk/ipc/namespace.c | 37 -
 trunk/ipc/shm.c | 2 +-
 trunk/kernel/nsproxy.c | 42 -
 trunk/kernel/utsname.c | 39 -
 trunk/mm/Kconfig | 23 -
 trunk/mm/Makefile | 1 -
 trunk/mm/cleancache.c | 244 ---
 trunk/mm/filemap.c | 11 -
 trunk/mm/fremap.c | 2 +-
 trunk/mm/hugetlb.c | 4 +-
 trunk/mm/memory.c | 2 +-
 trunk/mm/mlock.c | 8 +-
 trunk/mm/mmap.c | 8 +-
 trunk/mm/slub.c | 1 +
 trunk/mm/truncate.c | 6 -
 trunk/net/core/net_namespace.c | 65 -
 trunk/net/core/rtnetlink.c | 5 +-
 148 files changed, 2050 insertions(+), 6383 deletions(-)
 rename trunk/Documentation/ABI/{removed => obsolete}/o2cb (65%)
 delete mode 100644 trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
 delete mode 100644 trunk/Documentation/vm/cleancache.txt
 delete mode 100644 trunk/drivers/xen/tmem.c
 delete mode 100644 trunk/fs/ext4/mmp.c
 delete mode 100644 trunk/fs/ocfs2/move_extents.c
 delete mode 100644 trunk/fs/ocfs2/move_extents.h
 delete mode 100644 trunk/fs/proc/namespaces.c
 delete mode 100644 trunk/include/linux/cleancache.h
 delete mode 100644 trunk/mm/cleancache.c

diff --git a/[refs] b/[refs]
index e5a7f504fe7e..26644b9c747b 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: be93d8cfbae1996052e91b2883d306a5d9d0fe18
+refs/heads/master: 02424d8966d803e33cbe51469be56b5d177b4a37
diff --git a/trunk/Documentation/ABI/removed/o2cb b/trunk/Documentation/ABI/obsolete/o2cb
similarity index 65%
rename from trunk/Documentation/ABI/removed/o2cb
rename to trunk/Documentation/ABI/obsolete/o2cb
index 7f5daa465093..9c49d8e6c0cc 100644
--- a/trunk/Documentation/ABI/removed/o2cb
+++ b/trunk/Documentation/ABI/obsolete/o2cb
@@ -1,10 +1,11 @@
 What:		/sys/o2cb symlink
-Date:		May 2011
-KernelVersion:	2.6.40
+Date:		Dec 2005
+KernelVersion:	2.6.16
 Contact:	ocfs2-devel@oss.oracle.com
-Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
-		removed when new versions of ocfs2-tools which know to look
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
+		be removed when new versions of ocfs2-tools which know to look
 		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
 		software to look here, it should try /sys/fs/o2cb instead.
+		See Documentation/ABI/stable/o2cb for more information on usage.
 Users:		ocfs2-tools.
		It's sufficient to mail proposed changes to ocfs2-devel@oss.oracle.com.
diff --git a/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
deleted file mode 100644
index 662ae646ea12..000000000000
--- a/trunk/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
+++ /dev/null
@@ -1,11 +0,0 @@
-What:		/sys/kernel/mm/cleancache/
-Date:		April 2011
-Contact:	Dan Magenheimer
-Description:
-		/sys/kernel/mm/cleancache/ contains a number of files which
-		record a count of various cleancache operations
-		(sum across all filesystems):
-		succ_gets
-		failed_gets
-		puts
-		flushes
diff --git a/trunk/Documentation/feature-removal-schedule.txt b/trunk/Documentation/feature-removal-schedule.txt
index ff31b1cc50aa..95788ad2506c 100644
--- a/trunk/Documentation/feature-removal-schedule.txt
+++ b/trunk/Documentation/feature-removal-schedule.txt
@@ -262,6 +262,16 @@ Who:	Michael Buesch
 
 ---------------------------
 
+What:	/sys/o2cb symlink
+When:	January 2010
+Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
+	exists as a symlink for backwards compatibility for old versions of
+	ocfs2-tools. 2 years should be sufficient time to phase in new versions
+	which know to look in /sys/fs/o2cb.
+Who:	ocfs2-devel@oss.oracle.com
+
+---------------------------
+
 What:	Ability for non root users to shm_get hugetlb pages based on
 	mlock resource limits
 When:	2.6.31
diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt
index 3ae9bc94352a..c79ec58fd7f6 100644
--- a/trunk/Documentation/filesystems/ext4.txt
+++ b/trunk/Documentation/filesystems/ext4.txt
@@ -226,6 +226,10 @@ acl		Enables POSIX Access Control Lists support.
 noacl		This option disables POSIX Access Control List
 		support.
 
+reservation
+
+noreservation
+
 bsddf	(*)	Make 'df' act like BSD.
 minixdf		Make 'df' act like Minix.
diff --git a/trunk/Documentation/filesystems/ocfs2.txt b/trunk/Documentation/filesystems/ocfs2.txt
index 7618a287aa41..9ed920a8cd79 100644
--- a/trunk/Documentation/filesystems/ocfs2.txt
+++ b/trunk/Documentation/filesystems/ocfs2.txt
@@ -46,15 +46,9 @@ errors=panic		Panic and halt the machine if an error occurs.
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster
 			operations.
-noatime			Do not update access time.
-relatime(*)		Update atime if the previous atime is older than
-			mtime or ctime
-strictatime		Always update atime, but the minimum update interval
-			is specified by atime_quantum.
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
-			Set to zero to always update atime. This option need
-			work with strictatime.
+			Set to zero to always update atime.
 data=ordered	(*)	All data are forced directly out to the main
 			file system prior to its metadata being committed
 			to the journal.
diff --git a/trunk/Documentation/filesystems/xfs.txt b/trunk/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5d..7bff3e4f35df 100644
--- a/trunk/Documentation/filesystems/xfs.txt
+++ b/trunk/Documentation/filesystems/xfs.txt
@@ -39,12 +39,6 @@ When mounting an XFS filesystem, the following options are accepted.
 	drive level write caching to be enabled, for devices that
 	support write barriers.
 
-  discard
-	Issue command to let the block device reclaim space freed by the
-	filesystem. This is useful for SSD devices, thinly provisioned
-	LUNs and virtual machine images, but may have a performance
-	impact. This option is incompatible with the nodelaylog option.
-
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
 	Use with the "mtpt" option.
diff --git a/trunk/Documentation/vm/cleancache.txt b/trunk/Documentation/vm/cleancache.txt
deleted file mode 100644
index 36c367c73084..000000000000
--- a/trunk/Documentation/vm/cleancache.txt
+++ /dev/null
@@ -1,278 +0,0 @@
-MOTIVATION
-
-Cleancache is a new optional feature provided by the VFS layer that
-potentially dramatically increases page cache effectiveness for
-many workloads in many environments at a negligible cost.
-
-Cleancache can be thought of as a page-granularity victim cache for clean
-pages that the kernel's pageframe replacement algorithm (PFRA) would like
-to keep around, but can't since there isn't enough memory. So when the
-PFRA "evicts" a page, it first attempts to use cleancache code to
-put the data contained in that page into "transcendent memory", memory
-that is not directly accessible or addressable by the kernel and is
-of unknown and possibly time-varying size.
-
-Later, when a cleancache-enabled filesystem wishes to access a page
-in a file on disk, it first checks cleancache to see if it already
-contains it; if it does, the page of data is copied into the kernel
-and a disk access is avoided.
-
-Transcendent memory "drivers" for cleancache are currently implemented
-in Xen (using hypervisor memory) and zcache (using in-kernel compressed
-memory) and other implementations are in development.
-
-FAQs are included below.
-
-IMPLEMENTATION OVERVIEW
-
-A cleancache "backend" that provides transcendent memory registers itself
-to the kernel's cleancache "frontend" by calling cleancache_register_ops,
-passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
-
-Most important, cleancache is "ephemeral". Pages which are copied into
-cleancache have an indefinite lifetime which is completely unknowable
-by the kernel and so may or may not still be in cleancache at any later time.
-Thus, as its name implies, cleancache is not suitable for dirty pages.
-Cleancache has complete discretion over what pages to preserve and what
-pages to discard and when.
-
-Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
-pool id which, if positive, must be saved in the filesystem's superblock;
-a negative return value indicates failure. A "put_page" will copy a
-(presumably about-to-be-evicted) page into cleancache and associate it with
-the pool id, a file key, and a page index into the file. (The combination
-of a pool id, a file key, and an index is sometimes called a "handle".)
-A "get_page" will copy the page, if found, from cleancache into kernel memory.
-A "flush_page" will ensure the page no longer is present in cleancache;
-a "flush_inode" will flush all pages associated with the specified file;
-and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
-all files specified by the given pool id and also surrender the pool id.
-
-An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
-to treat the pool as shared using a 128-bit UUID as a key. On systems
-that may run multiple kernels (such as hard partitioned or virtualized
-systems) that may share a clustered filesystem, and where cleancache
-may be shared among those kernels, calls to init_shared_fs that specify the
-same UUID will receive the same pool id, thus allowing the pages to
-be shared. Note that any security requirements must be imposed outside
-of the kernel (e.g. by "tools" that control cleancache). Or a
-cleancache implementation can simply disable shared_init by always
-returning a negative value.
-
-If a get_page is successful on a non-shared pool, the page is flushed (thus
-making cleancache an "exclusive" cache). On a shared pool, the page
-is NOT flushed on a successful get_page so that it remains accessible to
-other sharers. The kernel is responsible for ensuring coherency between
-cleancache (shared or not), the page cache, and the filesystem, using
-cleancache flush operations as required.
-
-Note that cleancache must enforce put-put-get coherency and get-get
-coherency. For the former, if two puts are made to the same handle but
-with different data, say AAA by the first put and BBB by the second, a
-subsequent get can never return the stale data (AAA). For get-get coherency,
-if a get for a given handle fails, subsequent gets for that handle will
-never succeed unless preceded by a successful put with that handle.
-
-Last, cleancache provides no SMP serialization guarantees; if two
-different Linux threads are simultaneously putting and flushing a page
-with the same handle, the results are indeterminate. Callers must
-lock the page to ensure serial behavior.
-
-CLEANCACHE PERFORMANCE METRICS
-
-Cleancache monitoring is done by sysfs files in the
-/sys/kernel/mm/cleancache directory. The effectiveness of cleancache
-can be measured (across all filesystems) with:
-
-succ_gets - number of gets that were successful
-failed_gets - number of gets that failed
-puts - number of puts attempted (all "succeed")
-flushes - number of flushes attempted
-
-A backend implementation may provide additional metrics.
-
-FAQ
-
-1) Where's the value? (Andrew Morton)
-
-Cleancache provides a significant performance benefit to many workloads
-in many environments with negligible overhead by improving the
-effectiveness of the pagecache. Clean pagecache pages are
-saved in transcendent memory (RAM that is otherwise not directly
-addressable to the kernel); fetching those pages later avoids "refaults"
-and thus disk reads.
-
-Cleancache (and its sister code "frontswap") provide interfaces for
-this transcendent memory (aka "tmem"), which conceptually lies between
-fast kernel-directly-addressable RAM and slower DMA/asynchronous devices.
-Disallowing direct kernel or userland reads/writes to tmem
-is ideal when data is transformed to a different form and size (such
-as with compression) or secretly moved (as might be useful for write-
-balancing for some RAM-like devices). Evicted page-cache pages (and
-swap pages) are a great use for this kind of slower-than-RAM-but-much-
-faster-than-disk transcendent memory, and the cleancache (and frontswap)
-"page-object-oriented" specification provides a nice way to read and
-write -- and indirectly "name" -- the pages.
-
-In the virtual case, the whole point of virtualization is to statistically
-multiplex physical resources across the varying demands of multiple
-virtual machines. This is really hard to do with RAM and efforts to
-do it well with no kernel change have essentially failed (except in some
-well-publicized special-case workloads). Cleancache -- and frontswap --
-with a fairly small impact on the kernel, provide a huge amount
-of flexibility for more dynamic, flexible RAM multiplexing.
-Specifically, the Xen Transcendent Memory backend allows otherwise
-"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
-virtual machines, but the pages can be compressed and deduplicated to
-optimize RAM utilization. And when guest OS's are induced to surrender
-underutilized RAM (e.g. with "self-ballooning"), page cache pages
-are the first to go, and cleancache allows those pages to be
-saved and reclaimed if overall host system memory conditions allow.
-
-And the identical interface used for cleancache can be used in
-physical systems as well. The zcache driver acts as a memory-hungry
-device that stores pages of data in a compressed state. And
-the proposed "RAMster" driver shares RAM across multiple physical
-systems.
-
-2) Why does cleancache have its sticky fingers so deep inside the
-   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
-
-The core hooks for cleancache in VFS are in most cases a single line
-and the minimum set are placed precisely where needed to maintain
-coherency (via cleancache_flush operations) between cleancache,
-the page cache, and disk. All hooks compile into nothingness if
-cleancache is config'ed off and turn into a function-pointer-
-compare-to-NULL if config'ed on but no backend claims the ops
-functions, or to a compare-struct-element-to-negative if a
-backend claims the ops functions but a filesystem doesn't enable
-cleancache.
-
-Some filesystems are built entirely on top of VFS and the hooks
-in VFS are sufficient, so don't require an "init_fs" hook; the
-initial implementation of cleancache didn't provide this hook.
-But for some filesystems (such as btrfs), the VFS hooks are
-incomplete and one or more hooks in fs-specific code are required.
-And for some other filesystems, such as tmpfs, cleancache may
-be counterproductive. So it seemed prudent to require a filesystem
-to "opt in" to use cleancache, which requires adding a hook in
-each filesystem. Not all filesystems are supported by cleancache
-only because they haven't been tested. The existing set should
-be sufficient to validate the concept, the opt-in approach means
-that untested filesystems are not affected, and the hooks in the
-existing filesystems should make it very easy to add more
-filesystems in the future.
-
-The total impact of the hooks to existing fs and mm files is only
-about 40 lines added (not counting comments and blank lines).
-
-3) Why not make cleancache asynchronous and batched so it can
-   more easily interface with real devices with DMA instead
-   of copying each individual page? (Minchan Kim)
-
-The one-page-at-a-time copy semantics simplifies the implementation
-on both the frontend and backend and also allows the backend to
-do fancy things on-the-fly like page compression and
-page deduplication. And since the data is "gone" (copied into/out
-of the pageframe) before the cleancache get/put call returns,
-a great deal of race conditions and potential coherency issues
-are avoided. While the interface seems odd for a "real device"
-or for real kernel-addressable RAM, it makes perfect sense for
-transcendent memory.
-
-4) Why is non-shared cleancache "exclusive"? And where is the
-   page "flushed" after a "get"? (Minchan Kim)
-
-The main reason is to free up space in transcendent memory and
-to avoid unnecessary cleancache_flush calls. If you want inclusive,
-the page can be "put" immediately following the "get". If
-put-after-get for inclusive becomes common, the interface could
-be easily extended to add a "get_no_flush" call.
-
-The flush is done by the cleancache backend implementation.
-
-5) What's the performance impact?
-
-Performance analysis has been presented at OLS'09 and LCA'10.
-Briefly, performance gains can be significant on most workloads,
-especially when memory pressure is high (e.g. when RAM is
-overcommitted in a virtual workload); and because the hooks are
-invoked primarily in place of or in addition to a disk read/write,
-overhead is negligible even in worst case workloads. Basically
-cleancache replaces I/O with memory-copy-CPU-overhead; on older
-single-core systems with slow memory-copy speeds, cleancache
-has little value, but in newer multicore machines, especially
-consolidated/virtualized machines, it has great value.
-
-6) How do I add cleancache support for filesystem X? (Boaz Harrash)
-
-Filesystems that are well-behaved and conform to certain
-restrictions can utilize cleancache simply by making a call to
-cleancache_init_fs at mount time. Unusual, misbehaving, or
-poorly layered filesystems must either add additional hooks
-and/or undergo extensive additional testing... or should just
-not enable the optional cleancache.
-
-Some points for a filesystem to consider:
-
-- The FS should be block-device-based (e.g. a ram-based FS such
-  as tmpfs should not enable cleancache)
-- To ensure coherency/correctness, the FS must ensure that all
-  file removal or truncation operations either go through VFS or
-  add hooks to do the equivalent cleancache "flush" operations
-- To ensure coherency/correctness, either inode numbers must
-  be unique across the lifetime of the on-disk file OR the
-  FS must provide an "encode_fh" function.
-- The FS must call the VFS superblock alloc and deactivate routines
-  or add hooks to do the equivalent cleancache calls done there.
-- To maximize performance, all pages fetched from the FS should
-  go through the do_mpage_readpage routine or the FS should add
-  hooks to do the equivalent (cf. btrfs)
-- Currently, the FS blocksize must be the same as PAGESIZE. This
-  is not an architectural restriction, but no backends currently
-  support anything different.
-- A clustered FS should invoke the "shared_init_fs" cleancache
-  hook to get best performance for some backends.
-
-7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
-
-If cleancache would use the inode virtual address instead of
-inode/filehandle, the pool id could be eliminated. But, this
-won't work because cleancache retains pagecache data pages
-persistently even when the inode has been pruned from the
-inode unused list, and only flushes the data page if the file
-gets removed/truncated. So if cleancache used the inode kva,
-there would be potential coherency issues if/when the inode
-kva is reused for a different file. Alternately, if cleancache
-flushed the pages when the inode kva was freed, much of the value
-of cleancache would be lost because the cache of pages in cleancache
-is potentially much larger than the kernel pagecache and is most
-useful if the pages survive inode cache removal.
-
-8) Why is a global variable required?
-
-The cleancache_enabled flag is checked in all of the frequently-used
-cleancache hooks. The alternative is a function call to check a static
-variable. Since cleancache is enabled dynamically at runtime, systems
-that don't enable cleancache would suffer thousands (possibly
-tens-of-thousands) of unnecessary function calls per second. So the
-global variable allows cleancache to be enabled by default at compile
-time, but have insignificant performance impact when cleancache remains
-disabled at runtime.
-
-9) Does cleancache work with KVM?
-
-The memory model of KVM is sufficiently different that a cleancache
-backend may have less value for KVM. This remains to be tested,
-especially in an overcommitted system.
-
-10) Does cleancache work in userspace? It sounds useful for
-   memory hungry caches like web browsers. (Jamie Lokier)
-
-No plans yet, though we agree it sounds useful, at least for
-apps that bypass the page cache (e.g. O_DIRECT).
-
-Last updated: Dan Magenheimer, April 13 2011
diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS
index d54d551004f7..1ab17de642e5 100644
--- a/trunk/MAINTAINERS
+++ b/trunk/MAINTAINERS
@@ -3572,16 +3572,9 @@ M:	Andrew Morton
 M:	Jan Kara
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd/
-F:	include/linux/ext3_jbd.h
-F:	include/linux/jbd.h
-
-JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
-M:	"Theodore Ts'o"
-L:	linux-ext4@vger.kernel.org
-S:	Maintained
-F:	fs/jbd2/
-F:	include/linux/jbd2.h
+F:	fs/jbd*/
+F:	include/linux/ext*jbd*.h
+F:	include/linux/jbd*.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao
diff --git a/trunk/arch/powerpc/Kconfig b/trunk/arch/powerpc/Kconfig
index 423145a6f7ba..2f6a22e8e935 100644
--- a/trunk/arch/powerpc/Kconfig
+++ b/trunk/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
 	select HAVE_RCU_TABLE_FREE if SMP
+	select HAVE_SYSCALL_TRACEPOINTS
 
 config EARLY_PRINTK
 	bool
diff --git a/trunk/arch/powerpc/include/asm/ftrace.h b/trunk/arch/powerpc/include/asm/ftrace.h
index dde1296b8b41..169d039ed402 100644
--- a/trunk/arch/powerpc/include/asm/ftrace.h
+++ b/trunk/arch/powerpc/include/asm/ftrace.h
@@ -60,4 +60,18 @@ struct dyn_arch_ftrace {
 
 #endif
 
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) && !defined(__ASSEMBLY__)
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Compare the symbol name with the system call name. Skip the .sys or .SyS
+	 * prefix from the symbol name and the sys prefix from the system call name and
+	 * just match the rest. This is only needed on ppc64 since symbol names on
+	 * 32bit do not start with a period so the generic function will work.
+ */ + return !strcmp(sym + 4, name + 3); +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 && !__ASSEMBLY__ */ + #endif /* _ASM_POWERPC_FTRACE */ diff --git a/trunk/arch/powerpc/include/asm/syscall.h b/trunk/arch/powerpc/include/asm/syscall.h index 23913e902fc3..b54b2add07be 100644 --- a/trunk/arch/powerpc/include/asm/syscall.h +++ b/trunk/arch/powerpc/include/asm/syscall.h @@ -15,6 +15,11 @@ #include +/* ftrace syscalls requires exporting the sys_call_table */ +#ifdef CONFIG_FTRACE_SYSCALLS +extern const unsigned long *sys_call_table; +#endif /* CONFIG_FTRACE_SYSCALLS */ + static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/trunk/arch/powerpc/include/asm/thread_info.h b/trunk/arch/powerpc/include/asm/thread_info.h index 37c353e8af7c..836f231ec1f0 100644 --- a/trunk/arch/powerpc/include/asm/thread_info.h +++ b/trunk/arch/powerpc/include/asm/thread_info.h @@ -110,7 +110,8 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NOERROR 12 /* Force successful syscall return */ #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ -#define TIF_RUNLATCH 15 /* Is the runlatch enabled? */ +#define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ +#define TIF_RUNLATCH 16 /* Is the runlatch enabled? */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE @@ -600,3 +601,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) +unsigned long __init arch_syscall_addr(int nr) +{ + return sys_call_table[nr*2]; +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ diff --git a/trunk/arch/powerpc/kernel/ptrace.c b/trunk/arch/powerpc/kernel/ptrace.c index a6ae1cfad86c..cb22024f2b42 100644 --- a/trunk/arch/powerpc/kernel/ptrace.c +++ b/trunk/arch/powerpc/kernel/ptrace.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PPC32 #include #endif @@ -40,6 +41,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * The parameter save area on the stack is used to store arguments being passed * to callee function and is located at fixed offset from stack pointer. 
@@ -1710,6 +1714,9 @@ long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gpr[0]); + if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 if (!is_32bit_task()) @@ -1738,6 +1745,9 @@ void do_syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, regs->result); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->result); + step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); diff --git a/trunk/arch/x86/include/asm/xen/hypercall.h b/trunk/arch/x86/include/asm/xen/hypercall.h index d240ea950519..8508bfe52296 100644 --- a/trunk/arch/x86/include/asm/xen/hypercall.h +++ b/trunk/arch/x86/include/asm/xen/hypercall.h @@ -447,13 +447,6 @@ HYPERVISOR_hvm_op(int op, void *arg) return _hypercall2(unsigned long, hvm_op, op, arg); } -static inline int -HYPERVISOR_tmem_op( - struct tmem_op *op) -{ - return _hypercall1(int, tmem_op, op); -} - static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/trunk/drivers/video/mb862xx/mb862xx-i2c.c b/trunk/drivers/video/mb862xx/mb862xx-i2c.c index b953099edd8e..cb77d3b4657d 100644 --- a/trunk/drivers/video/mb862xx/mb862xx-i2c.c +++ b/trunk/drivers/video/mb862xx/mb862xx-i2c.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "mb862xxfb.h" #include "mb862xx_reg.h" diff --git a/trunk/drivers/xen/Makefile b/trunk/drivers/xen/Makefile index bbc18258ecc5..4781f806701d 100644 --- a/trunk/drivers/xen/Makefile +++ b/trunk/drivers/xen/Makefile @@ -1,6 +1,5 @@ obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ -obj-y += tmem.o nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) diff --git a/trunk/drivers/xen/tmem.c b/trunk/drivers/xen/tmem.c deleted file mode 100644 index 816a44959ef0..000000000000 --- a/trunk/drivers/xen/tmem.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Xen implementation for transcendent memory (tmem) - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. 
- * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define TMEM_CONTROL 0 -#define TMEM_NEW_POOL 1 -#define TMEM_DESTROY_POOL 2 -#define TMEM_NEW_PAGE 3 -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_READ 8 -#define TMEM_WRITE 9 -#define TMEM_XCHG 10 - -/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_VERSION_SHIFT 24 - - -struct tmem_pool_uuid { - u64 uuid_lo; - u64 uuid_hi; -}; - -struct tmem_oid { - u64 oid[3]; -}; - -#define TMEM_POOL_PRIVATE_UUID { 0, 0 } - -/* flags for tmem_ops.new_pool */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 - -/* xen tmem foundation ops/hypercalls */ - -static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, - u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) -{ - struct tmem_op op; - int rc = 0; - - op.cmd = tmem_cmd; - op.pool_id = tmem_pool; - op.u.gen.oid[0] = oid.oid[0]; - op.u.gen.oid[1] = oid.oid[1]; - op.u.gen.oid[2] = oid.oid[2]; - op.u.gen.index = index; - op.u.gen.tmem_offset = tmem_offset; - op.u.gen.pfn_offset = pfn_offset; - op.u.gen.len = len; - set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, - u32 flags, unsigned long pagesize) -{ - struct tmem_op op; - int rc = 0, pageshift; - - for (pageshift = 0; pagesize != 1; pageshift++) - pagesize >>= 1; - flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; - flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; - op.cmd = TMEM_NEW_POOL; - op.u.new.uuid[0] = uuid.uuid_lo; - op.u.new.uuid[1] = uuid.uuid_hi; - op.u.new.flags = flags; - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -/* xen generic tmem ops */ - -static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) -{ - return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, - 0, 0, 0, 0); -} - -static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) -{ - return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); -} - -static int xen_tmem_destroy_pool(u32 pool_id) -{ - struct tmem_oid oid = { { 0 } }; - - return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); -} - -int tmem_enabled; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = 1; - return 1; -} - -__setup("tmem", enable_tmem); - -/* cleancache ops */ - -static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - - if (pool < 0) - return; - if (ind != index) - return; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); -} - -static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - int ret; - - /* translate return values to linux semantics */ - if (pool < 0) - return -1; - if (ind != index) - return -1; - ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); - if (ret == 1) - return 0; - else - return -1; -} - -static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - if (ind != index) - return; - (void)xen_tmem_flush_page((u32)pool, oid, ind); -} - -static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - (void)xen_tmem_flush_object((u32)pool, oid); -} - -static void tmem_cleancache_flush_fs(int pool) -{ - if (pool < 0) - return; - (void)xen_tmem_destroy_pool((u32)pool); -} - -static int tmem_cleancache_init_fs(size_t pagesize) -{ - struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; - - return xen_tmem_new_pool(uuid_private, 0, pagesize); -} - -static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - struct tmem_pool_uuid shared_uuid; - - shared_uuid.uuid_lo = *(u64 *)uuid; - shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); - return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); -} - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static struct cleancache_ops tmem_cleancache_ops = { - .put_page = tmem_cleancache_put_page, - .get_page = tmem_cleancache_get_page, - .flush_page = tmem_cleancache_flush_page, - .flush_inode = tmem_cleancache_flush_inode, - .flush_fs = tmem_cleancache_flush_fs, - .init_shared_fs = tmem_cleancache_init_shared_fs, - .init_fs = tmem_cleancache_init_fs -}; - -static int __init xen_tmem_init(void) -{ - struct cleancache_ops old_ops; - - if (!xen_domain()) - return 0; -#ifdef CONFIG_CLEANCACHE - BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { - char *s = ""; - old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if 
(old_ops.init_fs != NULL) - s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); - } -#endif - return 0; -} - -module_init(xen_tmem_init) diff --git a/trunk/fs/9p/vfs_inode.c b/trunk/fs/9p/vfs_inode.c index 8d7f3e69ae29..7f6c67703195 100644 --- a/trunk/fs/9p/vfs_inode.c +++ b/trunk/fs/9p/vfs_inode.c @@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d) int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { - dentry_unhash(d); return v9fs_remove(i, d, 1); } @@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; diff --git a/trunk/fs/Kconfig b/trunk/fs/Kconfig index 19891aab9c6e..979992dcb386 100644 --- a/trunk/fs/Kconfig +++ b/trunk/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT @@ -121,20 +121,6 @@ config TMPFS See for details. -config TMPFS_POSIX_ACL - bool "Tmpfs POSIX Access Control Lists" - depends on TMPFS - select TMPFS_XATTR - select GENERIC_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website . - - If you don't know what Access Control Lists are, say N. - config TMPFS_XATTR bool "Tmpfs extended attributes" depends on TMPFS @@ -147,9 +133,22 @@ config TMPFS_XATTR Currently this enables support for the trusted.* and security.* namespaces. + If unsure, say N. + You need this for POSIX ACL support on tmpfs. - If unsure, say N. +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS_XATTR + select GENERIC_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. 
config HUGETLBFS bool "HugeTLB file system support" diff --git a/trunk/fs/affs/namei.c b/trunk/fs/affs/namei.c index 03330e2e390c..e3e9efc1fdd8 100644 --- a/trunk/fs/affs/namei.c +++ b/trunk/fs/affs/namei.c @@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); - dentry_unhash(dentry); - return affs_remove_header(dentry); } @@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); diff --git a/trunk/fs/afs/dir.c b/trunk/fs/afs/dir.c index 2c4e05160042..20c106f24927 100644 --- a/trunk/fs/afs/dir.c +++ b/trunk/fs/afs/dir.c @@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - dentry_unhash(dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; @@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - vnode = AFS_FS_I(old_dentry->d_inode); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/trunk/fs/autofs4/root.c b/trunk/fs/autofs4/root.c index 87d95a8cddbc..f55ae23b137e 100644 --- a/trunk/fs/autofs4/root.c +++ b/trunk/fs/autofs4/root.c @@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EACCES; - dentry_unhash(dentry); - if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) diff --git a/trunk/fs/bfs/dir.c b/trunk/fs/bfs/dir.c index c7d1d06b0483..b14cebfd9047 100644 --- a/trunk/fs/bfs/dir.c +++ b/trunk/fs/bfs/dir.c @@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct bfs_sb_info *info; int error = -ENOENT; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_bh = new_bh = NULL; old_inode = old_dentry->d_inode; if (S_ISDIR(old_inode->i_mode)) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 4f9893243dae..96fcfa522dab 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "extent_io.h" #include "extent_map.h" #include "compat.h" @@ -2017,13 +2016,6 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - goto out; - } - } - end = page_end; while (1) { lock_extent(tree, start, end, GFP_NOFS); @@ -2157,7 +2149,6 @@ static int __extent_read_full_page(struct extent_io_tree *tree, cur = cur + iosize; page_offset += iosize; } -out: if (!nr) { if (!PageError(page)) SetPageUptodate(page); diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index be4ffa12f3ef..0ac712efcdf2 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -39,7 +39,6 @@ #include #include #include -#include #include 
"compat.h" #include "ctree.h" #include "disk-io.h" @@ -625,7 +624,6 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_root = root_dentry; save_mount_options(sb, data); - cleancache_init_fs(sb); return 0; fail_close: diff --git a/trunk/fs/buffer.c b/trunk/fs/buffer.c index 698c6b2cc462..a08bb8e61c6f 100644 --- a/trunk/fs/buffer.c +++ b/trunk/fs/buffer.c @@ -41,7 +41,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -270,10 +269,6 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); @@ -2336,26 +2331,24 @@ EXPORT_SYMBOL(block_commit_write); * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. - * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. */ -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +int +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret; + int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; - goto out_unlock; + /* page got truncated out from underneath us */ + unlock_page(page); + goto out; } /* page is wholly or partially inside EOF */ @@ -2368,40 +2361,17 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (!ret) ret = block_commit_write(page, 0, end); - if (unlikely(ret < 0)) - goto out_unlock; - /* - * Freezing in progress? We check after the page is marked dirty and - * with page lock held so if the test here fails, we are sure freezing - * code will wait during syncing until the page fault is done - at that - * point page will be dirty and unlocked so freezing code will write it - * and writeprotect it again. - */ - set_page_dirty(page); - if (inode->i_sb->s_frozen != SB_UNFROZEN) { - ret = -EAGAIN; - goto out_unlock; - } - return 0; -out_unlock: - unlock_page(page); - return ret; -} -EXPORT_SYMBOL(__block_page_mkwrite); - -int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - int ret; - struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + if (unlikely(ret)) { + unlock_page(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + } else + ret = VM_FAULT_LOCKED; - /* - * This check is racy but catches the common case. The check in - * __block_page_mkwrite() is reliable. 
- */ - vfs_check_frozen(sb, SB_FREEZE_WRITE); - ret = __block_page_mkwrite(vma, vmf, get_block); - return block_page_mkwrite_return(ret); +out: + return ret; } EXPORT_SYMBOL(block_page_mkwrite); diff --git a/trunk/fs/coda/dir.c b/trunk/fs/coda/dir.c index a46126fd5735..2b8dae4d121e 100644 --- a/trunk/fs/coda/dir.c +++ b/trunk/fs/coda/dir.c @@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) int len = de->d_name.len; int error; - dentry_unhash(de); - error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ @@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); diff --git a/trunk/fs/configfs/dir.c b/trunk/fs/configfs/dir.c index 9d17d350abc5..9a37a9b6de3a 100644 --- a/trunk/fs/configfs/dir.c +++ b/trunk/fs/configfs/dir.c @@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; - dentry_unhash(dentry); - if (dentry->d_parent == configfs_sb->s_root) return -EPERM; diff --git a/trunk/fs/ecryptfs/inode.c b/trunk/fs/ecryptfs/inode.c index 227b409b8406..4d4cc6a90cd5 100644 --- a/trunk/fs/ecryptfs/inode.c +++ b/trunk/fs/ecryptfs/inode.c @@ -521,8 +521,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct dentry *lower_dir_dentry; int rc; - dentry_unhash(dentry); - lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); @@ -573,9 +571,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap = NULL; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); diff --git a/trunk/fs/ext3/super.c b/trunk/fs/ext3/super.c index aad153ef6b78..3c6a9e0eadc1 100644 --- a/trunk/fs/ext3/super.c +++ b/trunk/fs/ext3/super.c @@ -36,7 +36,6 @@ #include #include #include -#include #include @@ -1368,7 +1367,6 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, } else { ext3_msg(sb, KERN_INFO, "using internal journal"); } - cleancache_init_fs(sb); return res; } diff --git a/trunk/fs/ext4/Makefile b/trunk/fs/ext4/Makefile index 04109460ba9e..c947e36eda6c 100644 --- a/trunk/fs/ext4/Makefile +++ b/trunk/fs/ext4/Makefile @@ -6,8 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/trunk/fs/ext4/balloc.c b/trunk/fs/ext4/balloc.c index 264f6949511e..1c67139ad4b4 100644 --- a/trunk/fs/ext4/balloc.c +++ b/trunk/fs/ext4/balloc.c @@ -361,6 +361,130 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } +/** + * ext4_add_groupblocks() -- Add given blocks to 
an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + goto error_return; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to add blocks to the bitmap, + * so we need undo access. + */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. 
Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + ext4_lock_group(sb, block_group); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + grp->bb_free += blocks_freed; + up_write(&grp->alloc_sem); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + /** * ext4_has_free_blocks() * @sbi: in-core super block structure. @@ -369,8 +493,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; @@ -384,6 +507,11 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, EXT4_FREEBLOCKS_WATERMARK) { free_blocks = percpu_counter_sum_positive(fbc); dirty_blocks = percpu_counter_sum_positive(dbc); + if (dirty_blocks < 0) { + printk(KERN_CRIT "Dirty block accounting " + "went wrong %lld\n", + (long long)dirty_blocks); + } } /* Check whether we have space after * accounting for current dirty blocks & root reserved blocks. @@ -394,9 +522,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved blocks available? 
*/ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - + capable(CAP_SYS_RESOURCE)) { if (free_blocks >= (nblocks + dirty_blocks)) return 1; } @@ -405,9 +531,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, } int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) + s64 nblocks) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { + if (ext4_has_free_blocks(sbi, nblocks)) { percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); return 0; } else @@ -428,7 +554,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -451,8 +577,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned int flags, - unsigned long *count, int *errp) + ext4_fsblk_t goal, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; @@ -462,7 +587,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; - ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index a74b89c09f90..4daaf2b753f4 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -108,8 +108,7 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 -/* Use reserved root blocks if needed */ -#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -210,8 +209,6 @@ struct ext4_io_submit { */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ -#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ -#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ @@ -515,10 +512,6 @@ struct ext4_new_group_data { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 - /* Don't normalize allocation size (used for fallocate) */ -#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* * Flags used by ext4_free_blocks @@ -1035,7 +1028,7 @@ struct ext4_super_block { __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ @@ -1151,9 +1144,6 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif - /* ext4 extent cache stats */ - unsigned long extent_cache_hits; - unsigned 
long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; @@ -1211,9 +1201,6 @@ struct ext4_sb_info { struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; - - /* Kernel thread for multiple mount protection */ - struct task_struct *s_mmp_tsk; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1351,7 +1338,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1365,29 +1351,13 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - -#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_RECOVER| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_FLEX_BG) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1620,6 +1590,12 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, */ struct ext4_lazy_init { unsigned long li_state; + + wait_queue_head_t li_wait_daemon; + wait_queue_head_t li_wait_task; + struct timer_list li_timer; + struct task_struct *li_task; + struct list_head li_request_list; struct mutex li_list_mtx; }; @@ -1638,67 +1614,6 @@ struct ext4_features { struct completion f_kobj_unregister; }; -/* - * This structure will be used for multiple mount protection. It will be - * written into the block number saved in the s_mmp_block field in the - * superblock. Programs that check MMP should assume that if - * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe - * to use the filesystem, regardless of how old the timestamp is. - */ -#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ - -struct mmp_struct { - __le32 mmp_magic; /* Magic number for MMP */ - __le32 mmp_seq; /* Sequence no. 
updated periodically */ - - /* - * mmp_time, mmp_nodename & mmp_bdevname are only used for information - * purposes and do not affect the correctness of the algorithm - */ - __le64 mmp_time; /* Time last updated */ - char mmp_nodename[64]; /* Node which last updated MMP block */ - char mmp_bdevname[32]; /* Bdev which last updated MMP block */ - - /* - * mmp_check_interval is used to verify if the MMP block has been - * updated on the block device. The value is updated based on the - * maximum time to write the MMP block during an update cycle. - */ - __le16 mmp_check_interval; - - __le16 mmp_pad1; - __le32 mmp_pad2[227]; -}; - -/* arguments passed to the mmp thread */ -struct mmpd_data { - struct buffer_head *bh; /* bh from initial read_mmp_block() */ - struct super_block *sb; /* super block of the fs */ -}; - -/* - * Check interval multiplier - * The MMP block is written every update interval and initially checked every - * update interval x the multiplier (the value is then adapted based on the - * write latency). The reason is that writes can be delayed under load and we - * don't want readers to incorrectly assume that the filesystem is no longer - * in use. - */ -#define EXT4_MMP_CHECK_MULT 2UL - -/* - * Minimum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL - -/* - * Maximum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - /* * Function prototypes */ @@ -1723,12 +1638,10 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, - unsigned int flags, - unsigned long *count, - int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1793,8 +1706,6 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ @@ -1818,7 +1729,6 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -1828,8 +1738,6 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int 
ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1880,10 +1788,6 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int, __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, - const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) extern void __ext4_grp_locked_error(const char *, unsigned int, \ struct super_block *, ext4_group_t, \ unsigned long, ext4_fsblk_t, \ @@ -2160,8 +2064,6 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2190,9 +2092,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc); -/* mmp.c */ -extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); - /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ diff --git a/trunk/fs/ext4/ext4_jbd2.c b/trunk/fs/ext4/ext4_jbd2.c index f5240aa15601..6e272ef6ba96 100644 --- a/trunk/fs/ext4/ext4_jbd2.c +++ b/trunk/fs/ext4/ext4_jbd2.c @@ -6,6 +6,20 @@ #include +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/trunk/fs/ext4/ext4_jbd2.h b/trunk/fs/ext4/ext4_jbd2.h index bb85757689b6..d0f53538a57f 100644 --- a/trunk/fs/ext4/ext4_jbd2.h +++ b/trunk/fs/ext4/ext4_jbd2.h @@ -126,6 +126,9 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); @@ -143,6 +146,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, int __ext4_handle_dirty_super(const char *where, unsigned int line, handle_t *handle, struct super_block *sb); +#define ext4_journal_get_undo_access(handle, bh) \ + __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c 
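The undo-access helper re-added to ext4_jbd2.[ch] above uses the same call-site-capturing wrapper pattern as the existing ext4_journal_get_write_access: a macro expands __func__ and __LINE__ at each caller, so a failed journal call can be reported with the exact call site. A minimal userspace sketch of that pattern (hypothetical names, not code from this patch):

#include <stdio.h>

/* Illustration only: the call-site-capturing wrapper behind
 * ext4_journal_get_undo_access(). Names here are made up. */
static int __do_get_access(const char *where, unsigned int line,
                           int handle_valid)
{
        int err = 0;

        if (handle_valid) {
                err = -1; /* pretend the journal call failed */
                if (err)
                        fprintf(stderr, "abort at %s:%u (err %d)\n",
                                where, line, err);
        }
        return err;
}

/* The macro captures __func__/__LINE__ per call site, so the error
 * path can name the caller without needing a backtrace. */
#define do_get_access(valid) __do_get_access(__func__, __LINE__, (valid))

int main(void)
{
        return do_get_access(1) ? 1 : 0;
}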
index 5199bac7fc62..4890d6f3ad15 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -46,13 +46,6 @@ #include -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags); - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -199,13 +192,12 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err, unsigned int flags) + struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, flags, - NULL, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } @@ -482,43 +474,9 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) } ext_debug("\n"); } - -static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, - ext4_fsblk_t newblock, int level) -{ - int depth = ext_depth(inode); - struct ext4_extent *ex; - - if (depth != level) { - struct ext4_extent_idx *idx; - idx = path[level].p_idx; - while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); - idx++; - } - - return; - } - - ex = path[depth].p_ext; - while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", - le32_to_cpu(ex->ee_block), - ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - newblock); - ex++; - } -} - #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) -#define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -834,14 +792,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; + struct ext4_extent *ex; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; @@ -889,7 +847,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err, flags); + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -918,6 +876,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -929,12 +888,25 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } /* start copy from next extent */ - m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; - ext4_ext_show_move(inode, path, newblock, depth); + /* TODO: we could do it by single memmove */ + m = 0; + 
path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(path[depth].p_ext->ee_block), + ext4_ext_pblock(path[depth].p_ext), + ext4_ext_is_uninitialized(path[depth].p_ext), + ext4_ext_get_actual_len(path[depth].p_ext), + newblock); + /*memmove(ex++, path[depth].p_ext++, + sizeof(struct ext4_extent)); + neh->eh_entries++;*/ + path[depth].p_ext++; + m++; + } if (m) { - struct ext4_extent *ex; - ex = EXT_FIRST_EXTENT(neh); - memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); le16_add_cpu(&neh->eh_entries, m); } @@ -996,8 +968,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; - /* move remainder of path[i] to the new index block */ + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, @@ -1006,13 +982,20 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - /* start copy indexes */ - m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, - EXT_MAX_INDEX(path[i].p_hdr)); - ext4_ext_show_move(inode, path, newblock, i); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", i, + le32_to_cpu(path[i].p_idx->ei_block), + ext4_idx_pblock(path[i].p_idx), + newblock); + /*memmove(++fidx, path[i].p_idx++, + sizeof(struct ext4_extent_idx)); + neh->eh_entries++; + BUG_ON(neh->eh_entries > neh->eh_max);*/ + path[i].p_idx++; + m++; + } if (m) { - memmove(++fidx, path[i].p_idx, + memmove(++fidx, path[i].p_idx - m, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -1073,9 +1056,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; @@ -1083,8 +1065,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err, flags); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; @@ -1159,9 +1140,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, * if no free index is found, then it requests in-depth growing. 
*/ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1181,7 +1161,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, path, newext, i); if (err) goto out; @@ -1194,8 +1174,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, path, newext); if (err) goto out; @@ -1584,7 +1563,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -static int ext4_ext_try_to_merge_right(struct inode *inode, +static int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { @@ -1623,31 +1602,6 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, return merge_done; } -/* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. - */ -static int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) { - struct ext4_extent_header *eh; - unsigned int depth; - int merge_done = 0; - int ret = 0; - - depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); - eh = path[depth].p_hdr; - - if (ex > EXT_FIRST_EXTENT(eh)) - merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); - - if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); - - return ret; -} - /* * check if a portion of the "newext" extent overlaps with an * existing extent. @@ -1714,7 +1668,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; - int flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1789,9 +1742,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + err = ext4_ext_create_new_leaf(handle, inode, path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2052,25 +2003,13 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. 
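ext4_ext_try_to_merge(), restored above to its merge-right-only form, fuses two neighbouring extents only when they are contiguous in both the logical and the physical block space, which is why callers that want a left merge pass "ex - 1". A standalone sketch of that contiguity test, with simplified stand-in types and the length-limit checks omitted:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for struct ext4_extent fields (sketch only). */
struct ext {
        uint32_t lblk;   /* first logical block */
        uint64_t pblk;   /* first physical block */
        uint16_t len;    /* number of blocks */
};

/* Two extents can fuse only if they are back-to-back both logically
 * and physically; this mirrors the test behind ext4_ext_try_to_merge(). */
static int can_merge(const struct ext *a, const struct ext *b)
{
        return a->lblk + a->len == b->lblk &&
               a->pblk + a->len == b->pblk;
}

int main(void)
{
        struct ext a = { 0, 1000, 8 };
        struct ext b = { 8, 1008, 4 };

        if (can_merge(&a, &b)) {
                a.len += b.len;          /* absorb the right neighbour */
                printf("merged: %u..%u\n", (unsigned)a.lblk,
                       (unsigned)(a.lblk + a.len - 1));
        }
        return 0;
}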
- * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; int ret = 0; /* @@ -2078,59 +2017,25 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); ret = 1; } errout: - if (!ret) - sbi->extent_cache_misses++; - else - sbi->extent_cache_hits++; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - /* * ext4_ext_rm_idx: * removes index from the index block. 
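ext4_ext_in_cache(), rewritten above to fill a struct ext4_extent directly, consults a single cached (block, len, start) triple that is valid only while ec_len is non-zero. A minimal illustration of that one-slot lookup, with the i_block_reservation_lock locking omitted:

#include <stdint.h>
#include <stdio.h>

/* One-slot extent cache as consulted by ext4_ext_in_cache(); a zero
 * ec_len marks the slot invalid. Sketch only, not the patch's code. */
struct ext_cache {
        uint32_t ec_block;
        uint32_t ec_len;        /* 0 means the cache is invalid */
        uint64_t ec_start;
};

static int in_cache(const struct ext_cache *c, uint32_t block,
                    uint64_t *pblk)
{
        if (c->ec_len == 0)
                return 0;                        /* nothing cached */
        if (block < c->ec_block || block >= c->ec_block + c->ec_len)
                return 0;                        /* outside cached range */
        *pblk = c->ec_start + (block - c->ec_block);
        return 1;
}

int main(void)
{
        struct ext_cache c = { 100, 16, 5000 };
        uint64_t pblk;

        if (in_cache(&c, 105, &pblk))
                printf("block 105 -> pblk %llu\n",
                       (unsigned long long)pblk);
        return 0;
}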
@@ -2258,16 +2163,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); - + printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk(KERN_INFO "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2276,22 +2173,9 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, return 0; } - -/* - * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent - * - * @handle: The journal handle - * @inode: The files inode - * @path: The path to the leaf - * @start: The first block to remove - * @end: The last block to remove - */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_lblk_t start) { int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; @@ -2302,7 +2186,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2332,95 +2215,31 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? + ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; ext_debug(" border %u:%u\n", a, b); - /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { - ex--; - ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = ext4_ext_get_actual_len(ex); - continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCK) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? 
- ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { + block = 0; + num = 0; + BUG(); } else if (a != ex_ee_block) { /* remove tail of the extent */ block = ex_ee_block; num = a - block; } else if (b != ex_ee_block + ex_ee_len - 1) { /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCK) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + block = a; + num = b - a; + /* there is no "make a hole" API yet */ + BUG(); } else { /* remove whole extent: excellent! */ block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + BUG_ON(a != ex_ee_block); + BUG_ON(b != ex_ee_block + ex_ee_len - 1); } /* @@ -2451,13 +2270,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); + le16_add_cpu(&eh->eh_entries, -1); } ex->ee_block = cpu_to_le32(block); @@ -2473,27 +2286,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - /* - * If the extent was completely released, - * we need to remove it from the leaf - */ - if (num == 0) { - if (end != EXT_MAX_BLOCK) { - /* - * For hole punching, we need to scoot all the - * extents up when an extent is removed so that - * we dont have blank extents in the middle - */ - memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * - sizeof(struct ext4_extent)); - - /* Now get rid of the one at the end */ - memset(EXT_LAST_EXTENT(eh), 0, - sizeof(struct ext4_extent)); - } - le16_add_cpu(&eh->eh_entries, -1); - } - ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; @@ -2534,8 +2326,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2574,8 +2365,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + err = ext4_ext_rm_leaf(handle, inode, path, start); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2739,195 +2529,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) return ret; } -/* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* - * ext4_split_extent_at() splits an extent at given block. 
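The a/b clamping restored in ext4_ext_rm_leaf() above reduces partial removal to three cases: whole extent, tail, or head. A removal that starts and ends strictly inside one extent is exactly the case the old code refuses (BUG()) and that the removed hole-punch code handled by splitting the extent first. A standalone sketch of the classification, using made-up values:

#include <stdint.h>
#include <stdio.h>

/* Boundary arithmetic behind ext4_ext_rm_leaf(): clamp the removal
 * range [start, end] to one extent and classify the overlap.
 * Illustration only; the on-disk update is not shown. */
static void classify(uint32_t ee_block, uint16_t ee_len,
                     uint32_t start, uint32_t end)
{
        uint32_t last = ee_block + ee_len - 1;
        uint32_t a = start > ee_block ? start : ee_block;
        uint32_t b = end < last ? end : last;

        if (a == ee_block && b == last)
                printf("whole extent removed\n");
        else if (a == ee_block)
                printf("head removed: keep %u..%u\n",
                       (unsigned)(b + 1), (unsigned)last);
        else if (b == last)
                printf("tail removed: keep %u..%u\n",
                       (unsigned)ee_block, (unsigned)(a - 1));
        else
                printf("middle removed: extent must be split first\n");
}

int main(void)
{
        classify(100, 50, 120, 200);    /* tail removal */
        classify(100, 50, 110, 120);    /* middle: needs a split */
        return 0;
}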
- * - * @handle: the journal handle - * @inode: the file inode - * @path: the path to the extent - * @split: the logical block where the extent is splitted. - * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. - * @flags: flags used to insert new extent to extent tree. - * - * - * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are deterimined by split_flag. - * - * There are two cases: - * a> the extent are splitted into two extent. - * b> split is not needed, and just mark the extent. - * - * return 0 on success. - */ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, - int flags) -{ - ext4_fsblk_t newblock; - ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex2 = NULL; - unsigned int ee_len, depth; - int err = 0; - - ext_debug("ext4_split_extents_at: inode %lu, logical" - "block %llu\n", inode->i_ino, (unsigned long long)split); - - ext4_ext_show_leaf(inode, path); - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - newblock = split - ee_block + ext4_ext_pblock(ex); - - BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - - if (split == ee_block) { - /* - * case b: block @split is the block that the extent begins with - * then we just change the state of the extent, and splitting - * is not needed. - */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); - else - ext4_ext_mark_initialized(ex); - - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } - - /* case a */ - memcpy(&orig_ex, ex, sizeof(orig_ex)); - ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); - - /* - * path may lead to new leaf, not to original leaf any more - * after ext4_ext_insert_extent() returns, - */ - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto fix_extent_len; - - ex2 = &newex; - ex2->ee_block = cpu_to_le32(split); - ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); - ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } else if (err) - goto fix_extent_len; - -out: - ext4_ext_show_leaf(inode, path); - return err; - -fix_extent_len: - ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); - return err; -} - -/* - * ext4_split_extents() splits an extent and mark extent which is covered - * by @map as split_flags indicates - * - * It may result in splitting the extent into multiple extents (upto three) - * There are three possibilities: - * a> There is no split required - * b> Splits in two extents: Split is happening at either end of the extent - * c> Splits in three extents: Somone is splitting in 
middle of the extent - * - */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags) -{ - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len, depth; - int err = 0; - int uninitialized; - int split_flag1, flags1; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); - - if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk + map->m_len, split_flag1, flags1); - if (err) - goto out; - } - - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) - return PTR_ERR(path); - - if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk, split_flag1, flags); - if (err) - goto out; - } - - ext4_ext_show_leaf(inode, path); -out: - return err ? err : map->m_len; -} - #define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write @@ -2944,13 +2545,17 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { - struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; int err = 0; - int split_flag = 0; + int ret = 0; + int may_zeroout; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2962,86 +2567,280 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); + + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; + may_zeroout = ee_block + ee_len <= eof_block; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - - err = ext4_ext_get_access(handle, inode, path + depth); + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; } + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } /* - * four cases: - * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. - * 4. split the extent into two extents with out zeroout. + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. */ - split_map.m_lblk = map->m_lblk; - split_map.m_len = map->m_len; - + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); + /* ex3: to ee_block + ee_len : uninitialised */ if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - split_map.m_lblk = map->m_lblk; - split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - - ee_block); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - } + unsigned int newdepth; + /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ + if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { + /* + * map->m_lblk == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. 
+ * Mark second half initialized and zero out the + * initialized extent + */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = cpu_to_le16(ee_len - allocated); + ext4_ext_mark_uninitialized(ex); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, + ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + + /* + * We need to zero out the second half because + * an fallocate request can update file size and + * converting the second half to initialized extent + * implies that we can leak some junk data to user + * space. + */ + err = ext4_ext_zeroout(inode, ex3); + if (err) { + /* + * We should actually mark the + * second half as uninit and return error + * Insert would have changed the extent + */ + depth = ext_depth(inode); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + return err; + } + /* get the second half extent details */ + ex = path[depth].p_ext; + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; + } - split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; - allocated = map->m_len; + /* zeroed the second half */ + return allocated; } - } + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. 
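The error handling re-added throughout this function repeats one recovery pattern: attempt to insert the split-off extent, and if the tree has no room (ENOSPC), fall back to zeroing the original extent and marking it initialized, which makes the split unnecessary. A sketch of that fallback with stubbed-out helpers (hypothetical names):

#include <errno.h>
#include <stdio.h>

static int insert_extent(void)  { return -ENOSPC; } /* pretend tree is full */
static int zeroout_extent(void) { return 0; }       /* write zeroes instead */

static int convert_range(void)
{
        int err = insert_extent();

        if (err == -ENOSPC) {
                /* No room for a split: zero the whole extent and mark
                 * it initialized, restoring the original length. */
                err = zeroout_extent();
        }
        return err;
}

int main(void)
{
        printf("convert_range -> %d\n", convert_range());
        return 0;
}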
+ */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; - allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); - if (allocated < 0) - err = allocated; + allocated = map->m_len; + /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying + * to insert a extent in the middle zerout directly + * otherwise give the extent a chance to merge to left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && + map->m_lblk != ee_block && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + /* blocks available from map->m_lblk */ + return allocated; + } + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + if (ex2 != ex) + goto insert; + /* + * New (initialized) extent starts from the first block + * in the current extent. i.e., ex2 == ex + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex2 > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex2 - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex2--; + } + } + /* + * Try to Merge towards right. This might be required + * only when the whole extent is being written to. + * i.e. ex2 == ex and ex3 == NULL. + */ + if (!ex3) { + ret = ext4_ext_try_to_merge(inode, path, ex2); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + } + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? 
err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; } /* @@ -3072,11 +2871,15 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_ext_path *path, int flags) { - ext4_lblk_t eof_block; - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len; - int split_flag = 0, depth; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int may_zeroout; ext_debug("ext4_split_unwritten_extents: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3086,22 +2889,156 @@ static int ext4_split_unwritten_extents(handle_t *handle, inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. - */ + depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); -} + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + + /* + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. + */ + if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) + return allocated; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. 
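Both open-coded split paths restored by this patch carve an extent into at most three pieces: ex1 keeps the uninitialized head, ex2 covers the range being written, and ex3 keeps the uninitialized tail. A small sketch of the boundary arithmetic, using made-up values:

#include <stdint.h>
#include <stdio.h>

/* Geometry of the three-way split: head (ex1) and tail (ex3) exist
 * only when the written range does not reach the extent's edges. */
int main(void)
{
        uint32_t ee_block = 100, m_lblk = 110;
        uint16_t ee_len = 50, m_len = 20;

        if (m_lblk > ee_block)
                printf("ex1 (uninit): %u..%u\n",
                       (unsigned)ee_block, (unsigned)(m_lblk - 1));
        printf("ex2 (target): %u..%u\n",
               (unsigned)m_lblk, (unsigned)(m_lblk + m_len - 1));
        if (m_lblk + m_len < (uint32_t)(ee_block + ee_len))
                printf("ex3 (uninit): %u..%u\n",
                       (unsigned)(m_lblk + m_len),
                       (unsigned)(ee_block + ee_len - 1));
        return 0;
}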
+ */ + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > map->m_len) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from map->m_lblk */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = map->m_len; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written + * using direct I/O, uninitialised still. + */ + ex2->ee_block = cpu_to_le32(map->m_lblk); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + ext4_ext_show_leaf(inode, path); + return err ? 
err : allocated; +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -3110,27 +3047,46 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_extent_header *eh; int depth; int err = 0; + int ret = 0; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed + /* + * We have to see if it can be merged with the extent + * on the left. */ - ext4_ext_try_to_merge(inode, path, ex); - + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); out: @@ -3346,19 +3302,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3423,84 +3375,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? 
- allocated : map->m_len; - - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent - */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - + /* Do not put uninitialized extent in the cache */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; } - - ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); - - goto out2; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, map, path, flags, allocated, + newblock); + return ret; } } @@ -3562,8 +3446,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, else /* disable in-core preallocation for non-regular files */ ar.flags = 0; - if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) - ar.flags |= EXT4_MB_HINT_NOPREALLOC; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -3647,11 +3529,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, newblock, map->m_len, err ? err : allocated); - - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; - - return err ? err : result; + return err ? err : allocated; } void ext4_ext_truncate(struct inode *inode) @@ -3699,7 +3577,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3707,9 +3585,8 @@ void ext4_ext_truncate(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); - up_write(&EXT4_I(inode)->i_data_sem); - out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. 
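The orphan-record comment above is the key to the truncate path: the inode goes onto the orphan list before any blocks are freed, so a crash in the middle of a multi-transaction truncate can be finished by journal recovery, and it comes off the list only once the truncate completes. The ordering, sketched with hypothetical helpers (not the patch's code):

#include <stdio.h>

static void orphan_add(void)  { puts("inode on orphan list"); }
static void free_blocks(void) { puts("freeing blocks across many txns"); }
static void orphan_del(void)  { puts("inode off orphan list"); }

/* If the machine dies between orphan_add() and orphan_del(), journal
 * recovery finds the inode on the orphan list and finishes the
 * truncate, so no half-freed extents survive a crash. */
int main(void)
{
        orphan_add();
        free_blocks();
        orphan_del();
        return 0;
}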
@@ -3774,6 +3651,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3781,13 +3662,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); - trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -3817,8 +3691,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) break; } ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3949,7 +3822,6 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, pgoff_t last_offset; pgoff_t offset; pgoff_t index; - pgoff_t start_index = 0; struct page **pages = NULL; struct buffer_head *bh = NULL; struct buffer_head *head = NULL; @@ -3976,57 +3848,39 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, kfree(pages); return EXT_CONTINUE; } - index = 0; -next_page: /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + end = ((__u64)pages[0]->index << PAGE_SHIFT) >> blksize_bits; - if (!page_has_buffers(pages[index])) + if (!page_has_buffers(pages[0])) goto out; - head = page_buffers(pages[index]); + head = page_buffers(pages[0]); if (!head) goto out; - index++; bh = head; do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; + if (buffer_mapped(bh)) { /* get the 1st mapped buffer. */ + if (end > newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; goto found_mapped_buffer; } - bh = bh->b_this_page; end++; } while (bh != head); - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; + /* No mapped buffer found. */ + goto out; } else { /*Find contiguous delayed buffers. */ if (ret > 0 && pages[0]->index == last_offset) head = page_buffers(pages[0]); bh = head; - index = 1; - start_index = 0; } found_mapped_buffer: @@ -4049,7 +3903,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, end++; } while (bh != head); - for (; index < ret; index++) { + for (index = 1; index < ret; index++) { if (!page_has_buffers(pages[index])) { bh = NULL; break; @@ -4059,10 +3913,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, bh = NULL; break; } - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { + pages[0]->index + index) { /* Blocks are not contiguous. */ bh = NULL; break; @@ -4154,177 +4006,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? 
error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; - struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; - handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 0 : first_page_offset-1, - last_page_offset); - - if (err) - return err; - } - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); - } - - /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext4_orphan_add(handle, inode); - if (err) - goto out; - - /* - * Now we need to zero out the un block aligned data. 
- * If the file is smaller than a block, just - * zero out the middle - */ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); - } - } - - /* If there are no blocks to remove, return now */ - if (first_block >= last_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - - /* - * Loop over all the blocks and identify blocks - * that need to be punched out - */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { - - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; - - } else { - /* The block could not be identified */ - err = -EIO; - break; - } - } else { - /* Map blocks error */ - err = ret; - break; - } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; - break; - } - - iblock += num_blocks; - } - - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - } - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - return err; -} int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4361,3 +4042,4 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index 2c0972322009..7b80d543b89e 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -272,6 +272,7 @@ const struct file_operations ext4_file_operations = { }; const struct inode_operations ext4_file_inode_operations = { + .truncate = ext4_truncate, .setattr = ext4_setattr, .getattr = ext4_getattr, #ifdef CONFIG_EXT4_FS_XATTR diff --git a/trunk/fs/ext4/fsync.c b/trunk/fs/ext4/fsync.c index ce66d2fe826c..e9473cbe80df 100644 --- a/trunk/fs/ext4/fsync.c +++ b/trunk/fs/ext4/fsync.c @@ -36,7 +36,7 @@ static void dump_completed_IO(struct inode * inode) { -#ifdef EXT4FS_DEBUG +#ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; unsigned long flags; @@ -172,7 +172,6 @@ int ext4_sync_file(struct file *file, int datasync) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; tid_t commit_tid; - bool 
needs_barrier = false; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -212,12 +211,22 @@ int ext4_sync_file(struct file *file, int datasync) } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL); + ret = jbd2_log_wait_commit(journal, commit_tid); + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); out: trace_ext4_sync_file_exit(inode, ret); diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index 50d0e9c64584..f2fa5e8a582c 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -1930,7 +1930,7 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -2796,7 +2796,9 @@ static int write_cache_pages_da(struct address_space *mapping, continue; } - wait_on_page_writeback(page); + if (PageWriteback(page)) + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); if (mpd->next_page != page->index) @@ -3511,7 +3513,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - ext4_truncate_failed_write(inode); + vmtruncate(inode, isize); } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3913,31 +3915,10 @@ void ext4_set_aops(struct inode *inode) */ int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. 
If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, length, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3950,15 +3931,7 @@ int ext4_block_zero_page_range(handle_t *handle, return -EINVAL; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - + length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) @@ -4407,6 +4380,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, int ext4_can_truncate(struct inode *inode) { + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -4416,31 +4391,6 @@ int ext4_can_truncate(struct inode *inode) return 0; } -/* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks - * associated with the given offset and length - * - * @inode: File inode - * @offset: The offset where the hole will begin - * @len: The length of the hole - * - * Returns: 0 on sucess or negative on failure - */ - -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; - } - - return ext4_ext_punch_hole(file, offset, length); -} - /* * ext4_truncate() * @@ -4667,7 +4617,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* * Figure out the offset within the block group inode table */ - inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); inode_offset = ((inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb)); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); @@ -5361,7 +5311,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { + (attr->ia_size < inode->i_size || + (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5395,16 +5346,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + /* ext4_truncate will clear the flag */ + if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) ext4_truncate(inode); } + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) + rc = vmtruncate(inode, attr->ia_size); + if (!rc) { setattr_copy(inode, attr); mark_inode_dirty(inode); @@ -5861,19 +5811,15 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; } ret = 0; - - lock_page(page); - wait_on_page_writeback(page); - if (PageMappedToDisk(page)) { - 
up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; - } + if (PageMappedToDisk(page)) + goto out_unlock; if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; + lock_page(page); /* * return if we have all the buffers mapped. This avoid * the need to call write_begin/write_end which does a @@ -5883,8 +5829,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; + unlock_page(page); + goto out_unlock; } } unlock_page(page); @@ -5904,16 +5850,6 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret < 0) goto out_unlock; ret = 0; - - /* - * write_begin/end might have created a dirty page and someone - * could wander in and start the IO. Make sure that hasn't - * happened. - */ - lock_page(page); - wait_on_page_writeback(page); - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; out_unlock: if (ret) ret = VM_FAULT_SIGBUS; diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index 859f2ae8864e..d8a16eecf1d5 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -787,7 +787,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct inode *inode; char *data; char *bitmap; - struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); @@ -820,18 +819,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (first_group + i >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); - /* - * If page is uptodate then we came here after online resize - * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, - * which may be currently in use by an allocating task. 
- */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { - bh[i] = NULL; - continue; - } - err = -EIO; desc = ext4_get_group_desc(sb, first_group + i, NULL); if (desc == NULL) @@ -884,28 +871,26 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0; i < groups_per_page && bh[i]; i++) + if (!buffer_uptodate(bh[i])) goto out; err = 0; first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; + struct ext4_group_info *grinfo; group = (first_block + i) >> 1; if (group >= ngroups) break; - if (!bh[group - first_group]) - /* skip initialized uptodate buddy */ - continue; - /* * data carry information regarding this * particular group in the format specified @@ -934,8 +919,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); - /* init the buddy */ - memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; @@ -965,7 +948,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page; i++) + for (i = 0; i < groups_per_page && bh[i]; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -974,21 +957,22 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* - * Lock the buddy and bitmap pages. This make sure other parallel init_group - * on the same buddy page doesn't happen whild holding the buddy page lock. - * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. 
This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock */ -static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, + ext4_group_t group) { - struct inode *inode = EXT4_SB(sb)->s_buddy_cache; - int block, pnum, poff; + int i; + int block, pnum; int blocks_per_page; - struct page *page; - - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t first_group; + struct ext4_group_info *grp; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; /* @@ -998,40 +982,57 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, */ block = group * 2; pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (!page) - return -EIO; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + first_group = pnum * blocks_per_page / 2; - if (blocks_per_page >= 2) { - /* buddy and bitmap are on the same page */ - return 0; - } + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (!page) - return -EIO; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; - return 0; + if ((first_group + i) >= ngroups) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; } -static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - page_cache_release(e4b->bd_bitmap_page); - } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - page_cache_release(e4b->bd_buddy_page); + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. 
This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); } + } /* @@ -1043,60 +1044,93 @@ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *this_grp; - struct ext4_buddy e4b; - struct page *page; int ret = 0; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; mb_debug(1, "init group %u\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have pinned buddy page to page cache. + * would have taken the alloc_sem lock. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); - if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ + ret = 0; goto err; } - - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); - if (ret) - goto err; - if (!PageUptodate(page)) { + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); - if (e4b.bd_buddy_page == NULL) { + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ - ret = 0; - goto err; + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); } - /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); - if (ret) - goto err; - if (!PageUptodate(page)) { + if (page == NULL || !PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); err: - ext4_mb_put_buddy_page_lock(&e4b); + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); return ret; } @@ -1130,8 +1164,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. 
This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ +repeat_load_buddy: + down_read(e4b->alloc_semp); if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* we need to check for group need init flag + * with alloc_semp held so that we can be sure + * that new blocks didn't get added to the group + * when we are loading the buddy cache + */ + up_read(e4b->alloc_semp); /* * we need full data about the group * to make a good selection @@ -1139,6 +1189,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ret = ext4_mb_init_group(sb, group); if (ret) return ret; + goto repeat_load_buddy; } /* @@ -1222,14 +1273,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, return 0; err: - if (page) - page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); return ret; } @@ -1239,6 +1291,9 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); } @@ -1551,6 +1606,9 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2601,7 +2659,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2611,9 +2669,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + if (test_opt(sb, DISCARD)) { + ret = ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4162,12 +4226,15 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); } } + if (ac->alloc_semp) + up_read(ac->alloc_semp); if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding - * doesn't grow big. + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() */ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); @@ -4236,9 +4303,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. 
*/ - while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { - + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { /* let others to free the space */ yield(); ar->len = ar->len >> 1; @@ -4248,15 +4313,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); - } else { - while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { - - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; - } + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; } inquota = ar->len; if (ar->len == 0) { @@ -4644,127 +4703,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, return; } -/** - * ext4_add_groupblocks() -- Add given blocks to an existing group - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to add to the block group - * @count: number of blocks to free - * - * This marks the blocks as free in the bitmap and buddy. - */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; - - ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) - goto error_return; - - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - goto error_return; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - for (i = 0, blocks_freed = 0; i < count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - blocks_freed++; - } - } - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - /* - * need to update group_info->bb_free and bitmap - * with group lock held. 
generate_buddy look at - * them with group lock_held - */ - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); - } - - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system @@ -4777,10 +4715,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; + int ret = 0; assert_spin_locked(ext4_group_lock_ptr(sb, group)); @@ -4794,9 +4733,12 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4818,26 +4760,21 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, * the group buddy bitmap. This is done until whole group is scanned. */ static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) +ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { void *bitmap; ext4_grpblk_t next, count = 0; - struct ext4_buddy e4b; - int ret; + ext4_group_t group; + int ret = 0; - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); - return ret; - } - bitmap = e4b.bd_bitmap; + BUG_ON(e4b == NULL); + bitmap = e4b->bd_bitmap; + group = e4b->bd_group; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? 
- e4b.bd_info->bb_first_free : start; while (start < max) { start = mb_find_next_zero_bit(bitmap, max, start); @@ -4846,8 +4783,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, e4b); + if (ret < 0) + break; count += next - start; } start = next + 1; @@ -4863,15 +4802,17 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b->bd_info->bb_free - count) < minblocks) break; } ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", count, group); + if (ret < 0) + count = ret; + return count; } @@ -4889,11 +4830,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct ext4_group_info *grp; + struct ext4_buddy e4b; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, len, minlen, trimmed; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; @@ -4901,6 +4842,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; @@ -4921,12 +4863,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; for (group = first_group; group <= last_group; group++) { - grp = ext4_get_group_info(sb, group); - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); - if (ret) - break; + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + break; } /* @@ -4939,14 +4880,16 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) last_block = first_block + len; len -= last_block - first_block; - if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, + if (e4b.bd_info->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, &e4b, first_block, last_block, minlen); if (cnt < 0) { ret = cnt; + ext4_mb_unload_buddy(&e4b); break; } } + ext4_mb_unload_buddy(&e4b); trimmed += cnt; first_block = 0; } diff --git a/trunk/fs/ext4/mballoc.h b/trunk/fs/ext4/mballoc.h index 20b5e7bfebd1..22bd4d7f289b 100644 --- a/trunk/fs/ext4/mballoc.h +++ b/trunk/fs/ext4/mballoc.h @@ -193,6 +193,11 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -210,6 +215,7 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) diff --git a/trunk/fs/ext4/migrate.c b/trunk/fs/ext4/migrate.c index 
b57b98fb44d1..92816b4e0f16 100644 --- a/trunk/fs/ext4/migrate.c +++ b/trunk/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/trunk/fs/ext4/mmp.c b/trunk/fs/ext4/mmp.c deleted file mode 100644 index 9bdef3f537c5..000000000000 --- a/trunk/fs/ext4/mmp.c +++ /dev/null @@ -1,351 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ext4.h" - -/* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk - * faster. - */ -static int write_mmp_block(struct buffer_head *bh) -{ - mark_buffer_dirty(bh); - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(WRITE_SYNC, bh); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - return 1; - - return 0; -} - -/* - * Read the MMP block. It _must_ be read from disk and hence we clear the - * uptodate flag on the buffer. - */ -static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, - ext4_fsblk_t mmp_block) -{ - struct mmp_struct *mmp; - - if (*bh) - clear_buffer_uptodate(*bh); - - /* This would be sb_bread(sb, mmp_block), except we need to be sure - * that the MD RAID device cache has been bypassed, and that the read - * is not blocked in the elevator. */ - if (!*bh) - *bh = sb_getblk(sb, mmp_block); - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; - } - } - if (!*bh) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; - } - - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) - return -EINVAL; - - return 0; -} - -/* - * Dump as much information as possible to help the admin. - */ -void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, - const char *function, unsigned int line, const char *msg) -{ - __ext4_warning(sb, function, line, msg); - __ext4_warning(sb, function, line, - "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", - (long long unsigned int) le64_to_cpu(mmp->mmp_time), - mmp->mmp_nodename, mmp->mmp_bdevname); -} - -/* - * kmmpd will update the MMP sequence every s_mmp_update_interval seconds - */ -static int kmmpd(void *data) -{ - struct super_block *sb = ((struct mmpd_data *) data)->sb; - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct mmp_struct *mmp; - ext4_fsblk_t mmp_block; - u32 seq = 0; - unsigned long failed_writes = 0; - int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned mmp_check_interval; - unsigned long last_update_time; - unsigned long diff; - int retval; - - mmp_block = le64_to_cpu(es->s_mmp_block); - mmp = (struct mmp_struct *)(bh->b_data); - mmp->mmp_time = cpu_to_le64(get_seconds()); - /* - * Start with the higher mmp_check_interval and reduce it if - * the MMP block is being updated on time. 
- */ - mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - bdevname(bh->b_bdev, mmp->mmp_bdevname); - - memcpy(mmp->mmp_nodename, init_utsname()->sysname, - sizeof(mmp->mmp_nodename)); - - while (!kthread_should_stop()) { - if (++seq > EXT4_MMP_SEQ_MAX) - seq = 1; - - mmp->mmp_seq = cpu_to_le32(seq); - mmp->mmp_time = cpu_to_le64(get_seconds()); - last_update_time = jiffies; - - retval = write_mmp_block(bh); - /* - * Don't spew too many error messages. Print one every - * (s_mmp_update_interval * 60) seconds. - */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); - failed_writes++; - } - - if (!(le32_to_cpu(es->s_feature_incompat) & - EXT4_FEATURE_INCOMPAT_MMP)) { - ext4_warning(sb, "kmmpd being stopped since MMP feature" - " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - if (sb->s_flags & MS_RDONLY) { - ext4_warning(sb, "kmmpd being stopped since filesystem " - "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - diff = jiffies - last_update_time; - if (diff < mmp_update_interval * HZ) - schedule_timeout_interruptible(mmp_update_interval * - HZ - diff); - - /* - * We need to make sure that more than mmp_check_interval - * seconds have not passed since writing. If that has happened - * we need to check if the MMP block is as we left it. - */ - diff = jiffies - last_update_time; - if (diff > mmp_check_interval * HZ) { - struct buffer_head *bh_check = NULL; - struct mmp_struct *mmp_check; - - retval = read_mmp_block(sb, &bh_check, mmp_block); - if (retval) { - ext4_error(sb, "error reading MMP data: %d", - retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - mmp_check = (struct mmp_struct *)(bh_check->b_data); - if (mmp->mmp_seq != mmp_check->mmp_seq || - memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, - sizeof(mmp->mmp_nodename))) { - dump_mmp_msg(sb, mmp_check, - "Error while updating MMP info. " - "The filesystem seems to have been" - " multiply mounted."); - ext4_error(sb, "abort"); - goto failed; - } - put_bh(bh_check); - } - - /* - * Adjust the mmp_check_interval depending on how much time - * it took for the MMP block to be written. - */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - } - - /* - * Unmount seems to be clean. - */ - mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); - mmp->mmp_time = cpu_to_le64(get_seconds()); - - retval = write_mmp_block(bh); - -failed: - kfree(data); - brelse(bh); - return retval; -} - -/* - * Get a random new sequence number but make sure it is not greater than - * EXT4_MMP_SEQ_MAX. - */ -static unsigned int mmp_new_seq(void) -{ - u32 new_seq; - - do { - get_random_bytes(&new_seq, sizeof(u32)); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; -} - -/* - * Protect the filesystem from being mounted more than once. 
- */ -int ext4_multi_mount_protect(struct super_block *sb, - ext4_fsblk_t mmp_block) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct buffer_head *bh = NULL; - struct mmp_struct *mmp = NULL; - struct mmpd_data *mmpd_data; - u32 seq; - unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned int wait_time = 0; - int retval; - - if (mmp_block < le32_to_cpu(es->s_first_data_block) || - mmp_block >= ext4_blocks_count(es)) { - ext4_warning(sb, "Invalid MMP block in superblock"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - - mmp = (struct mmp_struct *)(bh->b_data); - - if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) - mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; - - /* - * If check_interval in MMP block is larger, use that instead of - * update_interval from the superblock. - */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; - - seq = le32_to_cpu(mmp->mmp_seq); - if (seq == EXT4_MMP_SEQ_CLEAN) - goto skip; - - if (seq == EXT4_MMP_SEQ_FSCK) { - dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); - goto failed; - } - - wait_time = min(mmp_check_interval * 2 + 1, - mmp_check_interval + 60); - - /* Print MMP interval if more than 20 secs. */ - if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) - ext4_warning(sb, "MMP interval %u higher than expected, please" - " wait.\n", wait_time * 2); - - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - -skip: - /* - * write a new random sequence number. - */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); - - retval = write_mmp_block(bh); - if (retval) - goto failed; - - /* - * wait for MMP interval and check mmp_seq. - */ - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); - if (!mmpd_data) { - ext4_warning(sb, "not enough memory for mmpd_data"); - goto failed; - } - mmpd_data->sb = sb; - mmpd_data->bh = bh; - - /* - * Start a kernel thread to update the MMP block periodically. - */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", - bdevname(bh->b_bdev, - mmp->mmp_bdevname)); - if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { - EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(mmpd_data); - ext4_warning(sb, "Unable to create kmmpd thread for %s.", - sb->s_id); - goto failed; - } - - return 0; - -failed: - brelse(bh); - return 1; -} - - diff --git a/trunk/fs/ext4/move_extent.c b/trunk/fs/ext4/move_extent.c index 2b8304bf3c50..b9f3e7862f13 100644 --- a/trunk/fs/ext4/move_extent.c +++ b/trunk/fs/ext4/move_extent.c @@ -876,7 +876,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * It needs to call wait_on_page_writeback() to wait for the * writeback of the page. 
*/ - wait_on_page_writeback(page); + if (PageWriteback(page)) + wait_on_page_writeback(page); /* Release old bh and drop refs */ try_to_release_page(page, 0); diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index b754b7721f51..67fd0b025858 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -1413,22 +1413,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; - - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { - /* - * Even if the block split failed, we have to properly write - * out all the changes we did so far. Otherwise we can end up - * with corrupted filesystem. - */ - ext4_mark_inode_dirty(handle, dir); - dx_release(frames); + dx_release (frames); + if (!(de)) return retval; - } - dx_release(frames); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); @@ -2252,7 +2240,6 @@ static int ext4_symlink(struct inode *dir, handle_t *handle; struct inode *inode; int l, err, retries = 0; - int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2260,26 +2247,10 @@ static int ext4_symlink(struct inode *dir, dquot_initialize(dir); - if (l > EXT4_N_BLOCKS * 4) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. - */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } retry: - handle = ext4_journal_start(dir, credits); + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2292,44 +2263,21 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) goto out_stop; - if (l > EXT4_N_BLOCKS * 4) { + if (l > sizeof(EXT4_I(inode)->i_data)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* - * We cannot call page_symlink() with transaction started - * because it calls into ext4_write_begin() which can wait - * for transaction commit if we are running out of space - * and thus we deadlock. So we have to stop transaction now - * and restart it when symlink contents is written. - * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. + * page_symlink() calls into ext4_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). 
*/ - drop_nlink(inode); - err = ext4_orphan_add(handle, inode); - ext4_journal_stop(handle); - if (err) - goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS - * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext4_journal_start(dir, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto err_drop_inode; - } - inc_nlink(inode); - err = ext4_orphan_del(handle, inode); if (err) { - ext4_journal_stop(handle); clear_nlink(inode); - goto err_drop_inode; + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; } } else { /* clear the extent format for fast symlink */ @@ -2345,10 +2293,6 @@ static int ext4_symlink(struct inode *dir, if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: - unlock_new_inode(inode); - iput(inode); - return err; } static int ext4_link(struct dentry *old_dentry, diff --git a/trunk/fs/ext4/page-io.c b/trunk/fs/ext4/page-io.c index 7bb8f76d470a..b6dbd056fcb1 100644 --- a/trunk/fs/ext4/page-io.c +++ b/trunk/fs/ext4/page-io.c @@ -203,29 +203,46 @@ static void ext4_end_bio(struct bio *bio, int error) for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; + int partial_write = 0; - if (error) { + head = page_buffers(page); + if (error) SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; + BUG_ON(!head); + if (head->b_size != PAGE_CACHE_SIZE) { + loff_t offset; + loff_t io_end_offset = io_end->offset + io_end->size; offset = (sector_t) page->index << PAGE_CACHE_SHIFT; bh = head; do { if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - + (offset+bh->b_size <= io_end_offset)) { + if (error) + buffer_io_error(bh); + + } + if (buffer_delay(bh)) + partial_write = 1; + else if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + else if (buffer_dirty(bh)) + partial_write = 1; offset += bh->b_size; bh = bh->b_this_page; } while (bh != head); } + /* + * If this is a partial write which happened to make + * all buffers uptodate then we can optimize away a + * bogus readpage() for the next read(). Here we + * 'discover' whether the page went uptodate as a + * result of this (potentially partial) write. 
+ */ + if (!partial_write) + SetPageUptodate(page); + put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index cc5c157aa11d..8553dfb310af 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -76,27 +75,11 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); -static inline int ext2_feature_set_ok(struct super_block *sb); -static inline int ext3_feature_set_ok(struct super_block *sb); static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) -#else -#define IS_EXT2_SB(sb) (0) -#endif - - #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, @@ -823,8 +806,6 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1115,7 +1096,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, INIT_INODE_TABLE)) seq_puts(seq, ",noinit_inode_table"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) + else if (sbi->s_li_wait_mult) seq_printf(seq, ",init_inode_table=%u", (unsigned) sbi->s_li_wait_mult); @@ -1206,7 +1187,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1917,7 +1900,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, @@ -1949,7 +1932,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); - cleancache_init_fs(sb); return res; } @@ -2443,18 +2425,6 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t extent_cache_hits_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); -} - -static ssize_t extent_cache_misses_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, 
PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); -} - static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2512,8 +2482,6 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RO_ATTR(extent_cache_hits); -EXT4_RO_ATTR(extent_cache_misses); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2529,8 +2497,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(extent_cache_hits), - ATTR_LIST(extent_cache_misses), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -2693,6 +2659,12 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } +static void ext4_lazyinode_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { @@ -2724,8 +2696,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; + timeout = jiffies - timeout; + if (elr->lr_sbi->s_li_wait_mult) + timeout *= elr->lr_sbi->s_li_wait_mult; + else + timeout *= 20; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -2737,7 +2712,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) /* * Remove lr_request from the list_request and free the - * request structure. Should be called with li_list_mtx held + * request tructure. 
Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { @@ -2755,16 +2730,14 @@ static void ext4_remove_li_request(struct ext4_li_request *elr) static void ext4_unregister_li_request(struct super_block *sb) { - mutex_lock(&ext4_li_mtx); - if (!ext4_li_info) { - mutex_unlock(&ext4_li_mtx); + struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; + + if (!ext4_li_info) return; - } mutex_lock(&ext4_li_info->li_list_mtx); - ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + ext4_remove_li_request(elr); mutex_unlock(&ext4_li_info->li_list_mtx); - mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; @@ -2783,10 +2756,17 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; struct list_head *pos, *n; struct ext4_li_request *elr; - unsigned long next_wakeup, cur; + unsigned long next_wakeup; + DEFINE_WAIT(wait); BUG_ON(NULL == eli); + eli->li_timer.data = (unsigned long)current; + eli->li_timer.function = ext4_lazyinode_timeout; + + eli->li_task = current; + wake_up(&eli->li_wait_task); + cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; @@ -2817,15 +2797,19 @@ static int ext4_lazyinit_thread(void *arg) if (freezing(current)) refrigerator(); - cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || + if ((time_after_eq(jiffies, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } - schedule_timeout_interruptible(next_wakeup - cur); - + eli->li_timer.expires = next_wakeup; + add_timer(&eli->li_timer); + prepare_to_wait(&eli->li_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + if (time_before(jiffies, next_wakeup)) + schedule(); + finish_wait(&eli->li_wait_daemon, &wait); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; @@ -2849,7 +2833,12 @@ static int ext4_lazyinit_thread(void *arg) goto cont_thread; } mutex_unlock(&eli->li_list_mtx); + del_timer_sync(&ext4_li_info->li_timer); + eli->li_task = NULL; + wake_up(&eli->li_wait_task); + kfree(ext4_li_info); + ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2877,6 +2866,7 @@ static int ext4_run_lazyinit_thread(void) if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); + del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4: error %d creating inode table " @@ -2885,6 +2875,8 @@ static int ext4_run_lazyinit_thread(void) return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + + wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); return 0; } @@ -2919,9 +2911,13 @@ static int ext4_li_info_new(void) if (!eli) return -ENOMEM; + eli->li_task = NULL; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); + init_waitqueue_head(&eli->li_wait_daemon); + init_waitqueue_head(&eli->li_wait_task); + init_timer(&eli->li_timer); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; @@ -2964,19 +2960,20 @@ static int ext4_register_li_request(struct super_block *sb, ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; - if (sbi->s_li_request != NULL) { - /* - * Reset timeout so it can be computed again, because - * s_li_wait_mult might have changed. 
- */ - sbi->s_li_request->lr_timeout = 0; + if (sbi->s_li_request != NULL) return 0; - } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || - !test_opt(sb, INIT_INODE_TABLE)) + !test_opt(sb, INIT_INODE_TABLE)) { + sbi->s_li_request = NULL; return 0; + } + + if (first_not_zeroed == ngroups) { + sbi->s_li_request = NULL; + return 0; + } elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) @@ -3169,12 +3166,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - /* - * set default s_li_wait_mult for lazyinit, for the case there is - * no mount option specified. - */ - sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { ext4_msg(sb, KERN_WARNING, @@ -3196,28 +3187,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); - if (IS_EXT2_SB(sb)) { - if (ext2_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext2 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - - if (IS_EXT3_SB(sb)) { - if (ext3_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext3 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -3490,11 +3459,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && - !(sb->s_flags & MS_RDONLY)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) - goto failed_mount3; - /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! 
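The ext4_lazyinit_thread hunks above back out a simplification: the minus side sleeps with a single schedule_timeout_interruptible(next_wakeup - cur) call, while the plus side restores the older machinery of an explicit kernel timer (li_timer) plus wait queue (li_wait_daemon). As a reading aid only, here is a minimal self-contained sketch of that restored idiom; the lazy_* names are invented for illustration, and this is a sketch of the pattern, not the patched ext4 code itself:

	#include <linux/jiffies.h>
	#include <linux/sched.h>
	#include <linux/timer.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(lazy_waitq);	/* cf. li_wait_daemon */
	static struct timer_list lazy_timer;		/* cf. li_timer */

	/* Timer callback, softirq context: just make the sleeper runnable. */
	static void lazy_timeout(unsigned long data)
	{
		wake_up_process((struct task_struct *)data);
	}

	/* One-time setup, run by the sleeping kthread itself. */
	static void lazy_sleep_setup(void)
	{
		init_timer(&lazy_timer);
		lazy_timer.function = lazy_timeout;
		lazy_timer.data = (unsigned long)current;
	}

	/* Sleep until next_wakeup (a jiffies value) or an earlier wake_up(). */
	static void lazy_sleep_until(unsigned long next_wakeup)
	{
		DEFINE_WAIT(wait);

		lazy_timer.expires = next_wakeup;
		add_timer(&lazy_timer);
		/*
		 * Queue ourselves before re-testing the deadline, so a wakeup
		 * arriving between the test and schedule() cannot be lost.
		 */
		prepare_to_wait(&lazy_waitq, &wait, TASK_INTERRUPTIBLE);
		if (time_before(jiffies, next_wakeup))
			schedule();
		finish_wait(&lazy_waitq, &wait);
		del_timer_sync(&lazy_timer);	/* disarm if we woke early */
	}

schedule_timeout_interruptible() performs the same arm/sleep/disarm sequence internally, which is why the newer code on the minus side needs neither the timer nor the extra wait-queue initialisation that these hunks bring back.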
@@ -3510,6 +3474,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } else { clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3742,8 +3707,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -4279,7 +4242,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - int err = 0; + int err; #ifdef CONFIG_QUOTA int i; #endif @@ -4405,13 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_MMP)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; - goto restore_opts; - } enable_quota = 1; } } @@ -4476,7 +4432,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; u64 fsid; - s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; @@ -4520,10 +4475,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4699,9 +4652,6 @@ static int ext4_quota_off(struct super_block *sb, int type) if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - if (!inode) - goto out; - /* Update modification times of quota files when userspace can * start looking at them */ handle = ext4_journal_start(inode, 1); @@ -4822,6 +4772,14 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -4834,22 +4792,10 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } - -static inline int ext2_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } -static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif #if !defined(CONFIG_EXT3_FS) && 
!defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) @@ -4865,24 +4811,10 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } - -static inline int ext3_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) - return 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } -static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } #endif static struct file_system_type ext4_fs_type = { @@ -4966,8 +4898,8 @@ static int __init ext4_init_fs(void) err = init_inodecache(); if (err) goto out1; - register_as_ext3(); register_as_ext2(); + register_as_ext3(); err = register_filesystem(&ext4_fs_type); if (err) goto out; diff --git a/trunk/fs/ext4/xattr.c b/trunk/fs/ext4/xattr.c index c757adc97250..b545ca1c459c 100644 --- a/trunk/fs/ext4/xattr.c +++ b/trunk/fs/ext4/xattr.c @@ -820,8 +820,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, - NULL, &error); + block = ext4_new_meta_blocks(handle, inode, + goal, NULL, &error); if (error) goto cleanup; diff --git a/trunk/fs/fat/namei_msdos.c b/trunk/fs/fat/namei_msdos.c index be15437c272e..3b222dafd15b 100644 --- a/trunk/fs/fat/namei_msdos.c +++ b/trunk/fs/fat/namei_msdos.c @@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); /* * Check whether the directory is not in use, then check @@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { err = -EIO; diff --git a/trunk/fs/fat/namei_vfat.c b/trunk/fs/fat/namei_vfat.c index c61a6789f36c..20b4ea53fdc4 100644 --- a/trunk/fs/fat/namei_vfat.c +++ b/trunk/fs/fat/namei_vfat.c @@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); err = fat_dir_empty(inode); @@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; diff --git a/trunk/fs/fuse/dir.c b/trunk/fs/fuse/dir.c index 0d0e3faddcfa..b32eb29a4e6f 100644 --- a/trunk/fs/fuse/dir.c +++ b/trunk/fs/fuse/dir.c @@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (IS_ERR(req)) return PTR_ERR(req); - dentry_unhash(entry); - req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); req->in.numargs = 1; @@ -693,10 +691,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); struct fuse_req *req = 
fuse_get_req(fc); - - if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode)) - dentry_unhash(newent); - if (IS_ERR(req)) return PTR_ERR(req); diff --git a/trunk/fs/hfs/dir.c b/trunk/fs/hfs/dir.c index 1cb70cdba2c1..b4d70b13be92 100644 --- a/trunk/fs/hfs/dir.c +++ b/trunk/fs/hfs/dir.c @@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - if (S_ISDIR(inode->i_mode)) - dentry_unhash(dentry); - if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); @@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - res = hfs_remove(new_dir, new_dentry); if (res) return res; diff --git a/trunk/fs/hfsplus/dir.c b/trunk/fs/hfsplus/dir.c index b28835091dd0..4df5059c25da 100644 --- a/trunk/fs/hfsplus/dir.c +++ b/trunk/fs/hfsplus/dir.c @@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - dentry_unhash(dentry); - if (inode->i_size != 2) return -ENOTEMPTY; @@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) { - dentry_unhash(new_dentry); + if (S_ISDIR(new_dentry->d_inode->i_mode)) res = hfsplus_rmdir(new_dir, new_dentry); - } else { + else res = hfsplus_unlink(new_dir, new_dentry); - } if (res) return res; } diff --git a/trunk/fs/hostfs/hostfs_kern.c b/trunk/fs/hostfs/hostfs_kern.c index e6816b9e6903..2638c834ed28 100644 --- a/trunk/fs/hostfs/hostfs_kern.c +++ b/trunk/fs/hostfs/hostfs_kern.c @@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) char *file; int err; - dentry_unhash(dentry); - if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; err = do_rmdir(file); @@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, char *from_name, *to_name; int err; - if (to->d_inode && S_ISDIR(to->d_inode->i_mode)) - dentry_unhash(to); - if ((from_name = dentry_name(from)) == NULL) return -ENOMEM; if ((to_name = dentry_name(to)) == NULL) { diff --git a/trunk/fs/hpfs/namei.c b/trunk/fs/hpfs/namei.c index ff0ce21c0867..1f05839c27a7 100644 --- a/trunk/fs/hpfs/namei.c +++ b/trunk/fs/hpfs/namei.c @@ -395,6 +395,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) dentry_unhash(dentry); if (!d_unhashed(dentry)) { + dput(dentry); hpfs_unlock(dir->i_sb); return -ENOSPC; } @@ -402,6 +403,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); + dput(dentry); } else { struct iattr newattrs; /*printk("HPFS: truncating file before delete.\n");*/ @@ -409,6 +411,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; err = notify_change(dentry, &newattrs); put_write_access(inode); + dput(dentry); if (!err) goto again; } @@ -439,8 +442,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int err; int r; - dentry_unhash(dentry); - hpfs_adjust_length(name, &len); hpfs_lock(dir->i_sb); err = -ENOENT; @@ -534,10 +535,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh; struct fnode *fnode; 
int err; - - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; hpfs_adjust_length(old_name, &old_len); diff --git a/trunk/fs/hugetlbfs/inode.c b/trunk/fs/hugetlbfs/inode.c index 7aafeb8fa300..e7a035781b7d 100644 --- a/trunk/fs/hugetlbfs/inode.c +++ b/trunk/fs/hugetlbfs/inode.c @@ -921,8 +921,7 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, struct user_struct **user, int creat_flags) { int error = -ENOMEM; diff --git a/trunk/fs/jbd2/commit.c b/trunk/fs/jbd2/commit.c index 7f21cf3aaf92..29148a81c783 100644 --- a/trunk/fs/jbd2/commit.c +++ b/trunk/fs/jbd2/commit.c @@ -219,6 +219,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -671,16 +672,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; } - write_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT); - commit_transaction->t_state = T_COMMIT_DFLUSH; - write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_need_data_flush && + if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); @@ -757,13 +754,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this IO to - * complete. The barrier must be here so that changes by - * jbd2_journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. 
- */ - smp_mb(); + /* Wake up any transactions which were waiting for this + IO to complete */ wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -802,10 +794,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); - write_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); - commit_transaction->t_state = T_COMMIT_JFLUSH; - write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -961,7 +949,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/trunk/fs/jbd2/journal.c b/trunk/fs/jbd2/journal.c index 9a7826990304..e0ec3db1c395 100644 --- a/trunk/fs/jbd2/journal.c +++ b/trunk/fs/jbd2/journal.c @@ -479,12 +479,9 @@ int __jbd2_log_space_left(journal_t *journal) int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* - * The only transaction we can possibly wait upon is the - * currently running transaction (if it exists). Otherwise, - * the target tid must be an old one. + * Are we already doing a recent enough commit? */ - if (journal->j_running_transaction && - journal->j_running_transaction->t_tid == target) { + if (!tid_geq(journal->j_commit_request, target)) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -496,15 +493,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } else if (!tid_geq(journal->j_commit_request, target)) - /* This should never happen, but if it does, preserve - the evidence before kjournald goes into a loop and - increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", - journal->j_commit_request, - journal->j_commit_sequence, - target, journal->j_running_transaction ? - journal->j_running_transaction->t_tid : 0); + } return 0; } @@ -587,47 +576,6 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) return ret; } -/* - * Return 1 if a given transaction has not yet sent barrier request - * connected with a transaction commit. If 0 is returned, transaction - * may or may not have sent the barrier. Used to avoid sending barrier - * twice in common cases. - */ -int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) -{ - int ret = 0; - transaction_t *commit_trans; - - if (!(journal->j_flags & JBD2_BARRIER)) - return 0; - read_lock(&journal->j_state_lock); - /* Transaction already committed? */ - if (tid_geq(journal->j_commit_sequence, tid)) - goto out; - commit_trans = journal->j_committing_transaction; - if (!commit_trans || commit_trans->t_tid != tid) { - ret = 1; - goto out; - } - /* - * Transaction is being committed and we already proceeded to - * submitting a flush to fs partition? 
- */ - if (journal->j_fs_dev != journal->j_dev) { - if (!commit_trans->t_need_data_flush || - commit_trans->t_state >= T_COMMIT_DFLUSH) - goto out; - } else { - if (commit_trans->t_state >= T_COMMIT_JFLUSH) - goto out; - } - ret = 1; -out: - read_unlock(&journal->j_state_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); - /* * Wait for a specified commit to complete. * The caller may not hold the journal lock. diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 3eec82d32fd4..05fa77a23711 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) */ /* - * Update transaction's maximum wait time, if debugging is enabled. + * Update transiaction's maximum wait time, if debugging is enabled. * * In order for t_max_wait to be reliable, it must be protected by a * lock. But doing so will mean that start_this_handle() can not be @@ -91,10 +91,11 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * means that maximum wait time reported by the jbd2_run_stats * tracepoint will always be zero. */ -static inline void update_t_max_wait(transaction_t *transaction, - unsigned long ts) +static inline void update_t_max_wait(transaction_t *transaction) { #ifdef CONFIG_JBD2_DEBUG + unsigned long ts = jiffies; + if (jbd2_journal_enable_debug && time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); @@ -120,7 +121,6 @@ static int start_this_handle(journal_t *journal, handle_t *handle, tid_t tid; int needed, need_to_start; int nblocks = handle->h_buffer_credits; - unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -271,7 +271,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ - update_t_max_wait(transaction, ts); + update_t_max_wait(transaction); handle->h_transaction = transaction; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -316,8 +316,7 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. + * Return a pointer to a newly allocated handle, or NULL on failure */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { @@ -922,8 +921,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); -out: jbd2_journal_put_journal_head(jh); +out: return err; } @@ -2148,13 +2147,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; - /* - * We only ever set this variable to 1 so the test is safe. 
Since - * t_need_data_flush is likely to be set, we do the test to save some - * cacheline bouncing - */ - if (!transaction->t_need_data_flush) - transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/trunk/fs/jffs2/dir.c b/trunk/fs/jffs2/dir.c index 05f73328b28b..82faddd1f321 100644 --- a/trunk/fs/jffs2/dir.c +++ b/trunk/fs/jffs2/dir.c @@ -609,8 +609,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) int ret; uint32_t now = get_seconds(); - dentry_unhash(dentry); - for (fd = f->dents ; fd; fd = fd->next) { if (fd->ino) return -ENOTEMPTY; @@ -786,9 +784,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, uint8_t type; uint32_t now; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* The VFS will check for us and prevent trying to rename a * file over a directory and vice versa, but if it's a directory, * the VFS can't check whether the victim is empty. The filesystem diff --git a/trunk/fs/jfs/namei.c b/trunk/fs/jfs/namei.c index 865df16a6cf3..eaaf2b511e89 100644 --- a/trunk/fs/jfs/namei.c +++ b/trunk/fs/jfs/namei.c @@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); - dentry_unhash(dentry); - /* Init inode for quota operations. */ dquot_initialize(dip); dquot_initialize(ip); @@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - dquot_initialize(old_dir); dquot_initialize(new_dir); diff --git a/trunk/fs/logfs/dir.c b/trunk/fs/logfs/dir.c index f34c9cde9e94..9ed89d1663f8 100644 --- a/trunk/fs/logfs/dir.c +++ b/trunk/fs/logfs/dir.c @@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - dentry_unhash(dentry); - if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, loff_t pos; int err; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* 1. locate source dd */ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); if (err) diff --git a/trunk/fs/minix/namei.c b/trunk/fs/minix/namei.c index f60aed8db9c4..6e6777f1b4b2 100644 --- a/trunk/fs/minix/namei.c +++ b/trunk/fs/minix/namei.c @@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { @@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct minix_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/mpage.c b/trunk/fs/mpage.c index fdfae9fa98cd..0afc809e46e0 100644 --- a/trunk/fs/mpage.c +++ b/trunk/fs/mpage.c @@ -27,7 +27,6 @@ #include #include #include -#include /* * I/O completion handler for multipage BIOs. 
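The do_mpage_readpage() hunk just below strips out the cleancache probe (matching the header include dropped in the mpage.c hunk above). For orientation, this is approximately the fragment being deleted, re-quoted with explanatory comments added; it uses do_mpage_readpage()'s own locals (page, fully_mapped, blocks_per_page) and its confused: label, so treat it as annotation rather than standalone code:

	/*
	 * Before adding the page to a read BIO, ask cleancache
	 * (transcendent memory) for a copy.  Only a fully mapped page
	 * backed by a single block is eligible, because a hit has to
	 * satisfy the entire page at once.
	 */
	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
	    cleancache_get_page(page) == 0) {
		/* Hit: the page has been filled from the cache copy. */
		SetPageUptodate(page);
		/*
		 * The confused: slow path sees PageUptodate and simply
		 * unlocks the page, so no disk I/O is issued.
		 */
		goto confused;
	}

cleancache_get_page() returns 0 only when the page was successfully filled from the cache, so setting the uptodate flag on that branch is safe; with the hook removed, every such read goes back to the block layer.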
@@ -272,12 +271,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } - /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c index 2358b326b221..6ff858c049c0 100644 --- a/trunk/fs/namei.c +++ b/trunk/fs/namei.c @@ -391,28 +391,79 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); -/* +/** + * nameidata_drop_rcu - drop this nameidata out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * * Path walking has 2 modes, rcu-walk and ref-walk (see - * Documentation/filesystems/path-lookup.txt). In situations when we can't - * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab - * normal reference counts on dentries and vfsmounts to transition to rcu-walk - * mode. Refcounts are grabbed at the last known good point before rcu-walk - * got stuck, so ref-walk may continue from there. If this is not successful - * (eg. a seqcount has changed), then failure is returned and it's up to caller - * to restart the path walk from the beginning in ref-walk mode. + * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt + * to drop out of rcu-walk mode and take normal reference counts on dentries + * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take + * refcounts at the last known good point before rcu-walk got stuck, so + * ref-walk may continue from there. If this is not successful (eg. a seqcount + * has changed), then failure is returned and path walk restarts from the + * beginning in ref-walk mode. + * + * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into + * ref-walk. Must be called from rcu-walk context. */ +static int nameidata_drop_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + struct dentry *dentry = nd->path.dentry; + int want_root = 0; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + if (want_root) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); +err_root: + if (want_root) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_drop_rcu(nd); + return 0; +} /** - * unlazy_walk - try to switch to ref-walk mode. - * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry or NULL + * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @dentry: dentry to drop * Returns: 0 on success, -ECHILD on failure * - * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd or NULL. 
Must be called from rcu-walk context. + * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, + * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; @@ -427,25 +478,18 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) goto err_root; } spin_lock(&parent->d_lock); - if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; - BUG_ON(nd->inode != parent->d_inode); - } else { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); - } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + /* + * If the sequence check on the child dentry passed, then the child has + * not been removed from its parent. This means the parent dentry must + * be valid and able to take a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); if (want_root) { path_get(&nd->root); @@ -457,10 +501,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) br_read_unlock(vfsmount_lock); nd->flags &= ~LOOKUP_RCU; return 0; - -err_child: +err: spin_unlock(&dentry->d_lock); -err_parent: spin_unlock(&parent->d_lock); err_root: if (want_root) @@ -468,6 +510,59 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) return -ECHILD; } +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) +{ + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } + return 0; +} + +/** + * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * + * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. + * nd->path should be the final element of the lookup, so nd->root is discarded. + * Must be called from rcu-walk context. 
+ */ +static int nameidata_drop_rcu_last(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_unlock; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + + return 0; + +err_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -511,39 +606,26 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -/** - * complete_walk - successful completion of path walk - * @nd: pointer nameidata +/* + * handle_reval_path - force revalidation of a dentry + * + * In some situations the path walking code will trust dentries without + * revalidating them. This causes problems for filesystems that depend on + * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set + * (which indicates that it's possible for the dentry to go stale), force + * a d_revalidate call before proceeding. * - * If we had been in RCU mode, drop out of it and legitimize nd->path. - * Revalidate the final result, unless we'd already done that during - * the path walk or the filesystem doesn't ask for it. Return 0 on - * success, -error on failure. In case of failure caller does not - * need to drop nd->path. + * Returns 0 if the revalidation was successful. If the revalidation fails, + * either return the error returned by d_revalidate or -ESTALE if the + * revalidation just returned 0. If d_revalidate returns 0, we attempt to + * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary.
*/ -static int complete_walk(struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; - if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - return -ECHILD; - } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - mntget(nd->path.mnt); - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; @@ -561,7 +643,6 @@ static int complete_walk(struct nameidata *nd) if (!status) status = -ESTALE; - path_put(&nd->path); return status; } @@ -1160,8 +1241,13 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; unlazy: - if (unlazy_walk(nd, dentry)) - return -ECHILD; + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } } else { dentry = __d_lookup(parent, name); } @@ -1217,7 +1303,7 @@ static inline int may_lookup(struct nameidata *nd) int err = exec_permission(nd->inode, IPERM_FLAG_RCU); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL)) + if (nameidata_drop_rcu(nd)) return -ECHILD; } return exec_permission(nd->inode, 0); @@ -1271,12 +1357,8 @@ static inline int walk_component(struct nameidata *nd, struct path *path, return -ENOENT; } if (unlikely(inode->i_op->follow_link) && follow) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - terminate_walk(nd); - return -ECHILD; - } - } + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; BUG_ON(inode != path->dentry->d_inode); return 1; } @@ -1575,8 +1657,18 @@ static int path_lookupat(int dfd, const char *name, } } - if (!err) - err = complete_walk(nd); + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) { + err = handle_reval_path(nd); + if (err) + path_put(&nd->path); + } if (!err && nd->flags & LOOKUP_DIRECTORY) { if (!nd->inode->i_op->lookup) { @@ -2042,9 +2134,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - error = complete_walk(nd); + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); if (error) - return ERR_PTR(error); + goto exit; audit_inode(pathname, nd->path.dentry); if (open_flag & O_CREAT) { error = -EISDIR; @@ -2052,9 +2148,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } goto ok; case LAST_BIND: - error = complete_walk(nd); + /* can't be RCU mode here */ + error = handle_reval_path(nd); if (error) - return ERR_PTR(error); + goto exit; audit_inode(pathname, dir); goto ok; } @@ -2073,9 +2170,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) /* symlink */ return NULL; /* sayonara */ - error = complete_walk(nd); - if (error) - return ERR_PTR(-ECHILD); + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { @@ -2087,9 +2185,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* 
create side of things */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } audit_inode(pathname, dir); error = -EISDIR; @@ -2529,10 +2629,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) } /* - * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 2 if we're the only user of this - * dentry, and if that is true (possibly after pruning the dcache), - * then we drop the dentry now. + * We try to drop the dentry early: we should have + * a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning + * the dcache), then we drop the dentry now. * * A low-level filesystem can, if it choses, legally * do a @@ -2545,9 +2645,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) */ void dentry_unhash(struct dentry *dentry) { + dget(dentry); shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); } @@ -2563,26 +2664,25 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; mutex_lock(&dentry->d_inode->i_mutex); - - error = -EBUSY; + dentry_unhash(dentry); if (d_mountpoint(dentry)) - goto out; - - error = security_inode_rmdir(dir, dentry); - if (error) - goto out; - - error = dir->i_op->rmdir(dir, dentry); - if (error) - goto out; - - dentry->d_inode->i_flags |= S_DEAD; - dont_mount(dentry); - -out: + error = -EBUSY; + else { + error = security_inode_rmdir(dir, dentry); + if (!error) { + error = dir->i_op->rmdir(dir, dentry); + if (!error) { + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + } + } + } mutex_unlock(&dentry->d_inode->i_mutex); - if (!error) + if (!error) { d_delete(dentry); + } + dput(dentry); + return error; } @@ -2953,7 +3053,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. - * d) conversion from fhandle to dentry may come in the wrong moment - when + * d) some filesystems don't support opened-but-unlinked directories, + * either because of layout or because they are not ready to deal with + * all cases correctly. The latter will be fixed (taking this sort of + * stuff into VFS), but the former is not going away. Solution: the same + * trick as in rmdir(). + * e) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. 
[FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive @@ -2963,7 +3068,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { int error = 0; - struct inode *target = new_dentry->d_inode; + struct inode *target; /* * If we are going to change the parent - check write permissions, @@ -2979,24 +3084,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); - - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; - - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + error = -EBUSY; + else { + if (target) + dentry_unhash(new_dentry); + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } -out: - if (target) + if (target) { + if (!error) { + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } mutex_unlock(&target->i_mutex); + if (d_unhashed(new_dentry)) + d_rehash(new_dentry); + dput(new_dentry); + } if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); @@ -3006,7 +3113,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *target = new_dentry->d_inode; + struct inode *target; int error; error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -3014,22 +3121,19 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, return error; dget(new_dentry); + target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); - - error = -EBUSY; if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - goto out; - - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) - dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); -out: + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (!error) { + if (target) + dont_mount(new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); + } if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); diff --git a/trunk/fs/namespace.c b/trunk/fs/namespace.c index fe59bd145d21..d99bcf59e4c2 100644 --- a/trunk/fs/namespace.c +++ b/trunk/fs/namespace.c @@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) static int flags_to_propagation_type(int flags) { - int type = flags & ~(MS_REC | MS_SILENT); + int type = flags & ~MS_REC; /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) diff --git a/trunk/fs/ncpfs/dir.c b/trunk/fs/ncpfs/dir.c index e3e646b06404..f6946bb5cb55 100644 --- a/trunk/fs/ncpfs/dir.c +++ b/trunk/fs/ncpfs/dir.c @@ -1033,8 +1033,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - dentry_unhash(dentry); - error = -EBUSY; if (!d_unhashed(dentry)) goto out; @@ -1141,9 
+1139,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/trunk/fs/nilfs2/namei.c b/trunk/fs/nilfs2/namei.c index 1102a5fbb744..546849b3e88f 100644 --- a/trunk/fs/nilfs2/namei.c +++ b/trunk/fs/nilfs2/namei.c @@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) struct nilfs_transaction_info ti; int err; - dentry_unhash(dentry); - err = nilfs_transaction_begin(dir->i_sb, &ti, 0); if (err) return err; @@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct nilfs_transaction_info ti; int err; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; diff --git a/trunk/fs/ocfs2/Makefile b/trunk/fs/ocfs2/Makefile index f17e58b32989..d8a0313e99e6 100644 --- a/trunk/fs/ocfs2/Makefile +++ b/trunk/fs/ocfs2/Makefile @@ -30,7 +30,6 @@ ocfs2-objs := \ namei.o \ refcounttree.o \ reservations.o \ - move_extents.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/trunk/fs/ocfs2/alloc.c b/trunk/fs/ocfs2/alloc.c index ed553c60de82..48aa9c7401c7 100644 --- a/trunk/fs/ocfs2/alloc.c +++ b/trunk/fs/ocfs2/alloc.c @@ -29,7 +29,6 @@ #include #include #include -#include #include @@ -7185,168 +7184,3 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, out: return ret; } - -static int ocfs2_trim_extent(struct super_block *sb, - struct ocfs2_group_desc *gd, - u32 start, u32 count) -{ - u64 discard, bcount; - - bcount = ocfs2_clusters_to_blocks(sb, count); - discard = le64_to_cpu(gd->bg_blkno) + - ocfs2_clusters_to_blocks(sb, start); - - trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); - - return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); -} - -static int ocfs2_trim_group(struct super_block *sb, - struct ocfs2_group_desc *gd, - u32 start, u32 max, u32 minbits) -{ - int ret = 0, count = 0, next; - void *bitmap = gd->bg_bitmap; - - if (le16_to_cpu(gd->bg_free_bits_count) < minbits) - return 0; - - trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), - start, max, minbits); - - while (start < max) { - start = ocfs2_find_next_zero_bit(bitmap, max, start); - if (start >= max) - break; - next = ocfs2_find_next_bit(bitmap, max, start); - - if ((next - start) >= minbits) { - ret = ocfs2_trim_extent(sb, gd, - start, next - start); - if (ret < 0) { - mlog_errno(ret); - break; - } - count += next - start; - } - start = next + 1; - - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } - - if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) - break; - } - - if (ret < 0) - count = ret; - - return count; -} - -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; - int ret, cnt; - u32 first_bit, last_bit, minlen; - struct buffer_head *main_bm_bh = NULL; - struct inode *main_bm_inode = NULL; - struct buffer_head *gd_bh = NULL; - struct ocfs2_dinode *main_bm; - struct ocfs2_group_desc *gd = NULL; - - start = range->start >> osb->s_clustersize_bits; - len = range->len >> osb->s_clustersize_bits; - minlen = 
range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - - if (minlen >= osb->bitmap_cpg) - return -EINVAL; - - main_bm_inode = ocfs2_get_system_file_inode(osb, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); - if (!main_bm_inode) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - - mutex_lock(&main_bm_inode->i_mutex); - - ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); - if (ret < 0) { - mlog_errno(ret); - goto out_mutex; - } - main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - for (group = first_group; group <= last_group;) { - if (first_bit + len >= osb->bitmap_cpg) - last_bit = osb->bitmap_cpg; - else - last_bit = first_bit + len; - - ret = ocfs2_read_group_descriptor(main_bm_inode, - main_bm, group, - &gd_bh); - if (ret < 0) { - mlog_errno(ret); - break; - } - - gd = (struct ocfs2_group_desc *)gd_bh->b_data; - cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); - brelse(gd_bh); - gd_bh = NULL; - if (cnt < 0) { - ret = cnt; - mlog_errno(ret); - break; - } - - trimmed += cnt; - len -= osb->bitmap_cpg - first_bit; - first_bit = 0; - if (group == osb->first_cluster_group_blkno) - group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - else - group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; -out_unlock: - ocfs2_inode_unlock(main_bm_inode, 0); - brelse(main_bm_bh); -out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); - iput(main_bm_inode); -out: - return ret; -} diff --git a/trunk/fs/ocfs2/alloc.h b/trunk/fs/ocfs2/alloc.h index ca381c584127..3bd08a03251c 100644 --- a/trunk/fs/ocfs2/alloc.h +++ b/trunk/fs/ocfs2/alloc.h @@ -239,7 +239,6 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci, struct buffer_head **leaf_bh); int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); /* * Helper function to look at the # of clusters in an extent record. */ diff --git a/trunk/fs/ocfs2/cluster/sys.c b/trunk/fs/ocfs2/cluster/sys.c index a4b07730b2e1..bc702dab5d1f 100644 --- a/trunk/fs/ocfs2/cluster/sys.c +++ b/trunk/fs/ocfs2/cluster/sys.c @@ -57,6 +57,7 @@ static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); + sysfs_remove_link(NULL, "o2cb"); kset_unregister(o2cb_kset); } @@ -68,6 +69,14 @@ int o2cb_sys_init(void) if (!o2cb_kset) return -ENOMEM; + /* + * Create this symlink for backwards compatibility with old + * versions of ocfs2-tools which look for things in /sys/o2cb. 
+ */ + ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); + if (ret) + goto error; + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) goto error; diff --git a/trunk/fs/ocfs2/dlm/dlmcommon.h b/trunk/fs/ocfs2/dlm/dlmcommon.h index d602abb51b61..4bdf7baee344 100644 --- a/trunk/fs/ocfs2/dlm/dlmcommon.h +++ b/trunk/fs/ocfs2/dlm/dlmcommon.h @@ -144,7 +144,6 @@ struct dlm_ctxt wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct dlm_recovery_ctxt reco; spinlock_t master_lock; @@ -402,18 +401,6 @@ static inline int dlm_lvb_is_empty(char *lvb) return 1; } -static inline char *dlm_list_in_text(enum dlm_lockres_list idx) -{ - if (idx == DLM_GRANTED_LIST) - return "granted"; - else if (idx == DLM_CONVERTING_LIST) - return "converting"; - else if (idx == DLM_BLOCKED_LIST) - return "blocked"; - else - return "unknown"; -} - static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -461,7 +448,6 @@ enum { DLM_FINALIZE_RECO_MSG = 518, DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, - DLM_BEGIN_EXIT_DOMAIN_MSG = 521, }; struct dlm_reco_node_data diff --git a/trunk/fs/ocfs2/dlm/dlmdebug.c b/trunk/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..04a32be0aeb9 100644 --- a/trunk/fs/ocfs2/dlm/dlmdebug.c +++ b/trunk/fs/ocfs2/dlm/dlmdebug.c @@ -756,12 +756,6 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) buf + out, len - out); out += snprintf(buf + out, len - out, "\n"); - /* Exit Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Exit Domain Map: "); - out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, - buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); - /* Live Map: xx xx xx */ out += snprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, diff --git a/trunk/fs/ocfs2/dlm/dlmdomain.c b/trunk/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95dcf93..3b179d6cbde0 100644 --- a/trunk/fs/ocfs2/dlm/dlmdomain.c +++ b/trunk/fs/ocfs2/dlm/dlmdomain.c @@ -132,12 +132,10 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * New in version 1.1: * - Message DLM_QUERY_REGION added to support global heartbeat * - Message DLM_QUERY_NODEINFO added to allow online node removes - * New in version 1.2: - * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 1, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -451,18 +449,14 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) dropped = dlm_empty_lockres(dlm, res); spin_lock(&res->spinlock); - if (dropped) - __dlm_lockres_calc_usage(dlm, res); - else - iter = res->hash_node.next; + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; spin_unlock(&res->spinlock); dlm_lockres_put(res); - if (dropped) { - cond_resched_lock(&dlm->spinlock); + if (dropped) goto redo_bucket; - } } cond_resched_lock(&dlm->spinlock); num += n; @@ -492,28 +486,6 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm) return ret; } -static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, - void *data, void **ret_data) -{ - struct dlm_ctxt *dlm = data; - unsigned int node; - struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) 
msg->buf; - - if (!dlm_grab(dlm)) - return 0; - - node = exit_msg->node_idx; - mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); - - spin_lock(&dlm->spinlock); - set_bit(node, dlm->exit_domain_map); - spin_unlock(&dlm->spinlock); - - dlm_put(dlm); - - return 0; -} - static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) { /* Yikes, a double spinlock! I need domain_lock for the dlm @@ -570,7 +542,6 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); - clear_bit(node, dlm->exit_domain_map); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -583,56 +554,29 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, return 0; } -static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, unsigned int node) { int status; struct dlm_exit_domain leave_msg; - mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, - msg_type, node); + mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", + node, dlm->name, dlm->node_num); memset(&leave_msg, 0, sizeof(leave_msg)); leave_msg.node_idx = dlm->node_num; - status = o2net_send_message(msg_type, dlm->key, &leave_msg, - sizeof(leave_msg), node, NULL); + status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, + &leave_msg, sizeof(leave_msg), node, + NULL); if (status < 0) - mlog(ML_ERROR, "Error %d sending domain exit message %u " - "to node %u on domain %s\n", status, msg_type, node, - dlm->name); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); + mlog(0, "status return %d from o2net_send_message\n", status); return status; } -static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) -{ - int node = -1; - - /* Support for begin exit domain was added in 1.2 */ - if (dlm->dlm_locking_proto.pv_major == 1 && - dlm->dlm_locking_proto.pv_minor < 2) - return; - - /* - * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely - * informational. Meaning if a node does not receive the message, - * so be it. - */ - spin_lock(&dlm->spinlock); - while (1) { - node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); - if (node >= O2NM_MAX_NODES) - break; - if (node == dlm->node_num) - continue; - - spin_unlock(&dlm->spinlock); - dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); - spin_lock(&dlm->spinlock); - } - spin_unlock(&dlm->spinlock); -} static void dlm_leave_domain(struct dlm_ctxt *dlm) { @@ -658,8 +602,7 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) clear_node = 1; - status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, - node); + status = dlm_send_one_domain_exit(dlm, node); if (status < 0 && status != -ENOPROTOOPT && status != -ENOTCONN) { @@ -734,7 +677,6 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) if (leave) { mlog(0, "shutting down domain %s\n", dlm->name); - dlm_begin_exit_domain(dlm); /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); @@ -967,7 +909,6 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * leftover join state. 
*/ BUG_ON(dlm->joining_node != assert->node_idx); set_bit(assert->node_idx, dlm->domain_map); - clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", @@ -1852,13 +1793,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) if (status) goto bail; - status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, - sizeof(struct dlm_exit_domain), - dlm_begin_exit_domain_handler, - dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; - bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/trunk/fs/ocfs2/dlm/dlmmaster.c b/trunk/fs/ocfs2/dlm/dlmmaster.c index 11eefb8c12e9..84d166328cf7 100644 --- a/trunk/fs/ocfs2/dlm/dlmmaster.c +++ b/trunk/fs/ocfs2/dlm/dlmmaster.c @@ -2339,55 +2339,65 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) dlm_lockres_put(res); } -/* - * A migrateable resource is one that is : - * 1. locally mastered, and, - * 2. zero local locks, and, - * 3. one or more non-local locks, or, one or more references - * Returns 1 if yes, 0 if not. +/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 + * if not. If 0, numlocks is set to the number of locks in the lockres. */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) + struct dlm_lock_resource *res, + int *numlocks, + int *hasrefs) { - enum dlm_lockres_list idx; - int nonlocal = 0, node_ref; + int ret; + int i; + int count = 0; struct list_head *queue; struct dlm_lock *lock; - u64 cookie; assert_spin_locked(&res->spinlock); - if (res->owner != dlm->node_num) - return 0; + *numlocks = 0; + *hasrefs = 0; - for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { - queue = dlm_list_idx_to_ptr(res, idx); + ret = -EINVAL; + if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "cannot migrate lockres with unknown owner!\n"); + goto leave; + } + + if (res->owner != dlm->node_num) { + mlog(0, "cannot migrate lockres this node doesn't own!\n"); + goto leave; + } + + ret = 0; + queue = &res->granted; + for (i = 0; i < 3; i++) { list_for_each_entry(lock, queue, list) { - if (lock->ml.node != dlm->node_num) { - nonlocal++; - continue; + ++count; + if (lock->ml.node == dlm->node_num) { + mlog(0, "found a lock owned by this node still " + "on the %s queue! will not migrate this " + "lockres\n", (i == 0 ? "granted" : + (i == 1 ? 
"converting" : + "blocked"))); + ret = -ENOTEMPTY; + goto leave; } - cookie = be64_to_cpu(lock->ml.cookie); - mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " - "%s list\n", dlm->name, res->lockname.len, - res->lockname.name, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - dlm_list_in_text(idx)); - return 0; } + queue++; } - if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (node_ref >= O2NM_MAX_NODES) - return 0; - } + *numlocks = count; - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, - res->lockname.name); + count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (count < O2NM_MAX_NODES) + *hasrefs = 1; - return 1; + mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, + res->lockname.len, res->lockname.name, *numlocks, *hasrefs); + +leave: + return ret; } /* @@ -2396,7 +2406,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, static int dlm_migrate_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 target) + struct dlm_lock_resource *res, + u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; @@ -2405,20 +2416,37 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; + int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, - target); + mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); + + /* + * ensure this lockres is a proper candidate for migration + */ + spin_lock(&res->spinlock); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret < 0) { + spin_unlock(&res->spinlock); + goto leave; + } + spin_unlock(&res->spinlock); + + /* no work to do */ + if (numlocks == 0 && !hasrefs) + goto leave; + + /* + * preallocate up front + * if this fails, abort + */ - /* preallocate up front. 
if this fails, abort */ ret = -ENOMEM; mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { @@ -2433,11 +2461,36 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } ret = 0; + /* + * find a node to migrate the lockres to + */ + + spin_lock(&dlm->spinlock); + /* pick a new node */ + if (!test_bit(target, dlm->domain_map) || + target >= O2NM_MAX_NODES) { + target = dlm_pick_migration_target(dlm, res); + } + mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, + namelen, name, target); + + if (target >= O2NM_MAX_NODES || + !test_bit(target, dlm->domain_map)) { + /* target chosen is not alive */ + ret = -EINVAL; + } + + if (ret) { + spin_unlock(&dlm->spinlock); + goto fail; + } + + mlog(0, "continuing with target = %u\n", target); + /* * clear any existing master requests and * add the migration mle to the list */ - spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); @@ -2478,7 +2531,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, dlm_put_mle(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); - mle = NULL; } goto leave; } @@ -2600,52 +2652,69 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (wake) wake_up(&res->wq); + /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); dlm_put(dlm); - mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, - name, target, ret); + mlog(0, "returning %d\n", ret); return ret; } #define DLM_MIGRATION_RETRY_MS 100 -/* - * Should be called only after beginning the domain leave process. +/* Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, * and there should be no local locks left on locally mastered resources. * * Called with the dlm spinlock held, may drop it to do migration, but * will re-acquire before exit. * - * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped - */ + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - u8 target = O2NM_MAX_NODES; - - assert_spin_locked(&dlm->spinlock); + int numlocks, hasrefs; spin_lock(&res->spinlock); - if (dlm_is_lockres_migrateable(dlm, res)) - target = dlm_pick_migration_target(dlm, res); - spin_unlock(&res->spinlock); + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + spin_unlock(&res->spinlock); + goto leave; + } - if (target == O2NM_MAX_NODES) + /* No need to migrate a lockres having no locks */ + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret >= 0 && numlocks == 0 && !hasrefs) { + spin_unlock(&res->spinlock); goto leave; + } + spin_unlock(&res->spinlock); /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
*/ spin_unlock(&dlm->spinlock); lock_dropped = 1; - ret = dlm_migrate_lockres(dlm, res, target); - if (ret) - mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", - dlm->name, res->lockname.len, res->lockname.name, - target, ret); + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } spin_lock(&dlm->spinlock); leave: return lock_dropped; @@ -2829,55 +2898,61 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, } } -/* - * Pick a node to migrate the lock resource to. This function selects a - * potential target based first on the locks and then on refmap. It skips - * nodes that are in the process of exiting the domain. - */ +/* for now this is not too intelligent. we will + * need stats to make this do the right thing. + * this just finds the first lock on one of the + * queues and uses that node as the target. */ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - enum dlm_lockres_list idx; + int i; struct list_head *queue = &res->granted; struct dlm_lock *lock; - int noderef; - u8 nodenum = O2NM_MAX_NODES; + int nodenum; assert_spin_locked(&dlm->spinlock); - assert_spin_locked(&res->spinlock); - /* Go through all the locks */ - for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { - queue = dlm_list_idx_to_ptr(res, idx); + spin_lock(&res->spinlock); + for (i=0; i<3; i++) { list_for_each_entry(lock, queue, list) { - if (lock->ml.node == dlm->node_num) - continue; - if (test_bit(lock->ml.node, dlm->exit_domain_map)) - continue; - nodenum = lock->ml.node; - goto bail; + /* up to the caller to make sure this node + * is alive */ + if (lock->ml.node != dlm->node_num) { + spin_unlock(&res->spinlock); + return lock->ml.node; + } } + queue++; + } + + nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (nodenum < O2NM_MAX_NODES) { + spin_unlock(&res->spinlock); + return nodenum; } + spin_unlock(&res->spinlock); + mlog(0, "have not found a suitable target yet! checking domain map\n"); - /* Go thru the refmap */ - noderef = -1; + /* ok now we're getting desperate. pick anyone alive. */ + nodenum = -1; while (1) { - noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, - noderef + 1); - if (noderef >= O2NM_MAX_NODES) + nodenum = find_next_bit(dlm->domain_map, + O2NM_MAX_NODES, nodenum+1); + mlog(0, "found %d in domain map\n", nodenum); + if (nodenum >= O2NM_MAX_NODES) break; - if (noderef == dlm->node_num) - continue; - if (test_bit(noderef, dlm->exit_domain_map)) - continue; - nodenum = noderef; - goto bail; + if (nodenum != dlm->node_num) { + mlog(0, "picking %d\n", nodenum); + return nodenum; + } } -bail: - return nodenum; + mlog(0, "giving up. 
no master to migrate to\n"); + return DLM_LOCK_RES_OWNER_UNKNOWN; } + + /* this is called by the new master once all lockres * data has been received */ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, diff --git a/trunk/fs/ocfs2/dlm/dlmrecovery.c b/trunk/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d28a21..f1beb6fc254d 100644 --- a/trunk/fs/ocfs2/dlm/dlmrecovery.c +++ b/trunk/fs/ocfs2/dlm/dlmrecovery.c @@ -2393,7 +2393,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) mlog(0, "node %u being removed from domain map!\n", idx); clear_bit(idx, dlm->domain_map); - clear_bit(idx, dlm->exit_domain_map); /* wake up migration waiters if a node goes down. * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); diff --git a/trunk/fs/ocfs2/dlmfs/dlmfs.c b/trunk/fs/ocfs2/dlmfs/dlmfs.c index b42076797049..8c5c0eddc365 100644 --- a/trunk/fs/ocfs2/dlmfs/dlmfs.c +++ b/trunk/fs/ocfs2/dlmfs/dlmfs.c @@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker; * signifies a bast fired on the lock. */ #define DLMFS_CAPABILITIES "bast stackglue" -static int param_set_dlmfs_capabilities(const char *val, +extern int param_set_dlmfs_capabilities(const char *val, struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); diff --git a/trunk/fs/ocfs2/file.c b/trunk/fs/ocfs2/file.c index b1e35a392ca5..89659d6dc206 100644 --- a/trunk/fs/ocfs2/file.c +++ b/trunk/fs/ocfs2/file.c @@ -2670,7 +2670,6 @@ const struct file_operations ocfs2_fops_no_plocks = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, - .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/trunk/fs/ocfs2/ioctl.c b/trunk/fs/ocfs2/ioctl.c index bc91072b7219..8f13c5989eae 100644 --- a/trunk/fs/ocfs2/ioctl.c +++ b/trunk/fs/ocfs2/ioctl.c @@ -22,11 +22,6 @@ #include "ioctl.h" #include "resize.h" #include "refcounttree.h" -#include "sysfile.h" -#include "dir.h" -#include "buffer_head_io.h" -#include "suballoc.h" -#include "move_extents.h" #include @@ -40,27 +35,31 @@ * be -EFAULT. The error will be returned from the ioctl(2) call. It's * just a best-effort to tell userspace that this request caused the error. 
*/ -static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, +static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } -static inline void o2info_set_request_filled(struct ocfs2_info_request *req) +#define o2info_set_request_error(a, b) \ + __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) + +static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } -static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) +#define o2info_set_request_filled(a) \ + __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) + +static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } -static inline int o2info_coherent(struct ocfs2_info_request *req) -{ - return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); -} +#define o2info_clear_request_filled(a) \ + __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { @@ -154,7 +153,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, oib.ib_blocksize = inode->i_sb->s_blocksize; - o2info_set_request_filled(&oib.ib_req); + o2info_set_request_filled(oib); if (o2info_to_user(oib, req)) goto bail; @@ -162,7 +161,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oib.ib_req, req); + o2info_set_request_error(oib, req); return status; } @@ -179,7 +178,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, oic.ic_clustersize = osb->s_clustersize; - o2info_set_request_filled(&oic.ic_req); + o2info_set_request_filled(oic); if (o2info_to_user(oic, req)) goto bail; @@ -187,7 +186,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oic.ic_req, req); + o2info_set_request_error(oic, req); return status; } @@ -204,7 +203,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, oim.im_max_slots = osb->max_slots; - o2info_set_request_filled(&oim.im_req); + o2info_set_request_filled(oim); if (o2info_to_user(oim, req)) goto bail; @@ -212,7 +211,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oim.im_req, req); + o2info_set_request_error(oim, req); return status; } @@ -229,7 +228,7 @@ int ocfs2_info_handle_label(struct inode *inode, memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); - o2info_set_request_filled(&oil.il_req); + o2info_set_request_filled(oil); if (o2info_to_user(oil, req)) goto bail; @@ -237,7 +236,7 @@ int ocfs2_info_handle_label(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oil.il_req, req); + o2info_set_request_error(oil, req); return status; } @@ -254,7 +253,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); - o2info_set_request_filled(&oiu.iu_req); + o2info_set_request_filled(oiu); if (o2info_to_user(oiu, req)) goto bail; @@ -262,7 +261,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oiu.iu_req, req); + o2info_set_request_error(oiu, req); return status; } @@ -281,7 +280,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, oif.if_incompat_features = 
osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; - o2info_set_request_filled(&oif.if_req); + o2info_set_request_filled(oif); if (o2info_to_user(oif, req)) goto bail; @@ -289,7 +288,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oif.if_req, req); + o2info_set_request_error(oif, req); return status; } @@ -306,7 +305,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, oij.ij_journal_size = osb->journal->j_inode->i_size; - o2info_set_request_filled(&oij.ij_req); + o2info_set_request_filled(oij); if (o2info_to_user(oij, req)) goto bail; @@ -314,408 +313,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; -} - -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) -{ - int status = 0, unlock = 0; - - struct buffer_head *bh = NULL; - struct ocfs2_dinode *dinode_alloc = NULL; - - if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); - - if (o2info_coherent(&fi->ifi_req)) { - status = ocfs2_inode_lock(inode_alloc, &bh, 0); - if (status < 0) { - mlog_errno(status); - goto bail; - } - unlock = 1; - } else { - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - dinode_alloc = (struct ocfs2_dinode *)bh->b_data; - - fi->ifi_stat[slot].lfi_total = - le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); - fi->ifi_stat[slot].lfi_free = - le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - - le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); - -bail: - if (unlock) - ocfs2_inode_unlock(inode_alloc, 0); - - if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); - - brelse(bh); - - return status; -} - -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) -{ - u32 i; - u64 blkno = -1; - char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; - struct ocfs2_info_freeinode *oifi = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *inode_alloc = NULL; - - oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); - if (!oifi) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - if (o2info_from_user(*oifi, req)) - goto bail; - - oifi->ifi_slotnum = osb->max_slots; - - for (i = 0; i < oifi->ifi_slotnum; i++) { - if (o2info_coherent(&oifi->ifi_req)) { - inode_alloc = ocfs2_get_system_file_inode(osb, type, i); - if (!inode_alloc) { - mlog(ML_ERROR, "unable to get alloc inode in " - "slot %u\n", i); - status = -EIO; - goto bail; - } - } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); - if (status < 0) { - status = -ENOENT; - goto bail; - } - } - - status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; - - iput(inode_alloc); - inode_alloc = NULL; - } - - o2info_set_request_filled(&oifi->ifi_req); - - if (o2info_to_user(*oifi, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oifi->ifi_req, req); - - kfree(oifi); - - return status; -} - -static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, - unsigned int chunksize) -{ - int index; - - index = __ilog2_u32(chunksize); - if (index >= OCFS2_INFO_MAX_HIST) - index = OCFS2_INFO_MAX_HIST - 1; - 
- hist->fc_chunks[index]++; - hist->fc_clusters[index] += chunksize; -} - -static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, - unsigned int chunksize) -{ - if (chunksize > stats->ffs_max) - stats->ffs_max = chunksize; - - if (chunksize < stats->ffs_min) - stats->ffs_min = chunksize; - - stats->ffs_avg += chunksize; - stats->ffs_free_chunks_real++; -} - -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) -{ - o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); - o2ffg_update_stats(&(ffg->iff_ffs), chunksize); -} - -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) -{ - int status = 0, used; - u64 blkno; - - struct buffer_head *bh = NULL; - struct ocfs2_group_desc *bg = NULL; - - unsigned int max_bits, num_clusters; - unsigned int offset = 0, cluster, chunk; - unsigned int chunk_free, last_chunksize = 0; - - if (!le32_to_cpu(rec->c_free)) - goto bail; - - do { - if (!bg) - blkno = le64_to_cpu(rec->c_blkno); - else - blkno = le64_to_cpu(bg->bg_next_group); - - if (bh) { - brelse(bh); - bh = NULL; - } - - if (o2info_coherent(&ffg->iff_req)) - status = ocfs2_read_group_descriptor(gb_inode, - gb_dinode, - blkno, &bh); - else - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - - if (status < 0) { - mlog(ML_ERROR, "Can't read the group descriptor # " - "%llu from device.", (unsigned long long)blkno); - status = -EIO; - goto bail; - } - - bg = (struct ocfs2_group_desc *)bh->b_data; - - if (!le16_to_cpu(bg->bg_free_bits_count)) - continue; - - max_bits = le16_to_cpu(bg->bg_bits); - offset = 0; - - for (chunk = 0; chunk < chunks_in_group; chunk++) { - /* - * the last chunk may not be an entire one. - */ - if ((offset + ffg->iff_chunksize) > max_bits) - num_clusters = max_bits - offset; - else - num_clusters = ffg->iff_chunksize; - - chunk_free = 0; - for (cluster = 0; cluster < num_clusters; cluster++) { - used = ocfs2_test_bit(offset, - (unsigned long *)bg->bg_bitmap); - /* - * - chunk_free counts free clusters in chunk #N. - * - last_chunksize records the size (in clusters) - * of the last real free chunk being counted. - */ - if (!used) { - last_chunksize++; - chunk_free++; - } - - if (used && last_chunksize) { - ocfs2_info_update_ffg(ffg, - last_chunksize); - last_chunksize = 0; - } - - offset++; - } - - if (chunk_free == ffg->iff_chunksize) - ffg->iff_ffs.ffs_free_chunks++; - } - - /* - * need to update the info for the last free chunk. 
- */ - if (last_chunksize) - ocfs2_info_update_ffg(ffg, last_chunksize); - - } while (le64_to_cpu(bg->bg_next_group)); - -bail: - brelse(bh); - - return status; -} - -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) -{ - u32 chunks_in_group; - int status = 0, unlock = 0, i; - - struct buffer_head *bh = NULL; - struct ocfs2_chain_list *cl = NULL; - struct ocfs2_chain_rec *rec = NULL; - struct ocfs2_dinode *gb_dinode = NULL; - - if (gb_inode) - mutex_lock(&gb_inode->i_mutex); - - if (o2info_coherent(&ffg->iff_req)) { - status = ocfs2_inode_lock(gb_inode, &bh, 0); - if (status < 0) { - mlog_errno(status); - goto bail; - } - unlock = 1; - } else { - status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - gb_dinode = (struct ocfs2_dinode *)bh->b_data; - cl = &(gb_dinode->id2.i_chain); - - /* - * Chunksize(in) clusters from userspace should be - * less than clusters in a group. - */ - if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { - status = -EINVAL; - goto bail; - } - - memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); - - ffg->iff_ffs.ffs_min = ~0U; - ffg->iff_ffs.ffs_clusters = - le32_to_cpu(gb_dinode->id1.bitmap1.i_total); - ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - - le32_to_cpu(gb_dinode->id1.bitmap1.i_used); - - chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; - - for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { - rec = &(cl->cl_recs[i]); - status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, - gb_dinode, - rec, ffg, - chunks_in_group); - if (status) - goto bail; - } - - if (ffg->iff_ffs.ffs_free_chunks_real) - ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / - ffg->iff_ffs.ffs_free_chunks_real); -bail: - if (unlock) - ocfs2_inode_unlock(gb_inode, 0); - - if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); - - if (gb_inode) - iput(gb_inode); - - brelse(bh); - - return status; -} - -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) -{ - u64 blkno = -1; - char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; - - struct ocfs2_info_freefrag *oiff; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *gb_inode = NULL; - - oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); - if (!oiff) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - if (o2info_from_user(*oiff, req)) - goto bail; - /* - * chunksize from userspace should be power of 2. 
- */ - if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || - (!oiff->iff_chunksize)) { - status = -EINVAL; - goto bail; - } - - if (o2info_coherent(&oiff->iff_req)) { - gb_inode = ocfs2_get_system_file_inode(osb, type, - OCFS2_INVALID_SLOT); - if (!gb_inode) { - mlog(ML_ERROR, "unable to get global_bitmap inode\n"); - status = -EIO; - goto bail; - } - } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); - if (status < 0) { - status = -ENOENT; - goto bail; - } - } - - status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); - if (status < 0) - goto bail; - - o2info_set_request_filled(&oiff->iff_req); - - if (o2info_to_user(*oiff, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiff->iff_req, req); - - kfree(oiff); + o2info_set_request_error(oij, req); return status; } @@ -729,7 +327,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, if (o2info_from_user(oir, req)) goto bail; - o2info_clear_request_filled(&oir); + o2info_clear_request_filled(oir); if (o2info_to_user(oir, req)) goto bail; @@ -737,7 +335,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(&oir, req); + o2info_set_request_error(oir, req); return status; } @@ -791,14 +389,6 @@ int ocfs2_info_handle_request(struct inode *inode, if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; - case OCFS2_INFO_FREEINODE: - if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) - status = ocfs2_info_handle_freeinode(inode, req); - break; - case OCFS2_INFO_FREEFRAG: - if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) - status = ocfs2_info_handle_freefrag(inode, req); - break; default: status = ocfs2_info_handle_unknown(inode, req); break; @@ -952,31 +542,6 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); - case FITRIM: - { - struct super_block *sb = inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&range, (struct fstrim_range *)arg, - sizeof(range))) - return -EFAULT; - - ret = ocfs2_trim_fs(sb, &range); - if (ret < 0) - return ret; - - if (copy_to_user((struct fstrim_range *)arg, &range, - sizeof(range))) - return -EFAULT; - - return 0; - } - case OCFS2_IOC_MOVE_EXT: - return ocfs2_ioctl_move_extents(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1004,7 +569,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, (struct reflink_arguments *)arg, @@ -1020,8 +584,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); - case OCFS2_IOC_MOVE_EXT: - break; default: return -ENOIOCTLCMD; } diff --git a/trunk/fs/ocfs2/move_extents.c b/trunk/fs/ocfs2/move_extents.c deleted file mode 100644 index 4c5488468c14..000000000000 --- a/trunk/fs/ocfs2/move_extents.c +++ /dev/null @@ -1,1153 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * move_extents.c - * - * Copyright (C) 2011 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include - -#include - -#include "ocfs2.h" -#include "ocfs2_ioctl.h" - -#include "alloc.h" -#include "aops.h" -#include "dlmglue.h" -#include "extent_map.h" -#include "inode.h" -#include "journal.h" -#include "suballoc.h" -#include "uptodate.h" -#include "super.h" -#include "dir.h" -#include "buffer_head_io.h" -#include "sysfile.h" -#include "suballoc.h" -#include "refcounttree.h" -#include "move_extents.h" - -struct ocfs2_move_extents_context { - struct inode *inode; - struct file *file; - int auto_defrag; - int partial; - int credits; - u32 new_phys_cpos; - u32 clusters_moved; - u64 refcount_loc; - struct ocfs2_move_extents *range; - struct ocfs2_extent_tree et; - struct ocfs2_alloc_context *meta_ac; - struct ocfs2_alloc_context *data_ac; - struct ocfs2_cached_dealloc_ctxt dealloc; -}; - -static int __ocfs2_move_extent(handle_t *handle, - struct ocfs2_move_extents_context *context, - u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, - int ext_flags) -{ - int ret = 0, index; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_extent_rec *rec, replace_rec; - struct ocfs2_path *path = NULL; - struct ocfs2_extent_list *el; - u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); - u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, - p_cpos, new_p_cpos, len); - if (ret) { - mlog_errno(ret); - goto out; - } - - memset(&replace_rec, 0, sizeof(replace_rec)); - replace_rec.e_cpos = cpu_to_le32(cpos); - replace_rec.e_leaf_clusters = cpu_to_le16(len); - replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - new_p_cpos)); - - path = ocfs2_new_path_from_et(&context->et); - if (!path) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); - if (ret) { - mlog_errno(ret); - goto out; - } - - el = path_leaf_el(path); - - index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; - goto out; - } - - rec = &el->l_recs[index]; - - BUG_ON(ext_flags != rec->e_flags); - /* - * after moving/defraging to new location, the extent is not going - * to be refcounted anymore. - */ - replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - context->et.et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_split_extent(handle, &context->et, path, index, - &replace_rec, context->meta_ac, - &context->dealloc); - if (ret) { - mlog_errno(ret); - goto out; - } - - ocfs2_journal_dirty(handle, context->et.et_root_bh); - - context->new_phys_cpos = new_p_cpos; - - /* - * need I to append truncate log for old clusters? 
- */ - if (old_blkno) { - if (ext_flags & OCFS2_EXT_REFCOUNTED) - ret = ocfs2_decrease_refcount(inode, handle, - ocfs2_blocks_to_clusters(osb->sb, - old_blkno), - len, context->meta_ac, - &context->dealloc, 1); - else - ret = ocfs2_truncate_log_append(osb, handle, - old_blkno, len); - } - -out: - return ret; -} - -/* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. - */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, - struct ocfs2_extent_tree *et, - u32 clusters_to_move, - u32 extents_to_split, - struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, - int extra_blocks, - int *credits) -{ - int ret, num_free_extents; - unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - num_free_extents = ocfs2_num_free_extents(osb, et); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) - extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); - - ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, - clusters_to_move + 2); - - mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", - extra_blocks, clusters_to_move, *credits); -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } - } - - return ret; -} - -/* - * Using one journal handle to guarantee the data consistency in case - * crash happens anywhere. - * - * XXX: defrag can end up with finishing partial extent as requested, - * due to not enough contiguous clusters can be found in allocator. - */ -static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, - u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) -{ - int ret, credits = 0, extra_blocks = 0, partial = context->partial; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - struct ocfs2_refcount_tree *ref_tree = NULL; - u32 new_phys_cpos, new_len; - u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - - if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - BUG_ON(!context->refcount_loc); - - ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - return ret; - } - - ret = ocfs2_prepare_refcount_change_for_del(inode, - context->refcount_loc, - phys_blkno, - *len, - &credits, - &extra_blocks); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); - if (ret) { - mlog_errno(ret); - goto out; - } - - /* - * should be using allocation reservation strategy there? 
- * - * if (context->data_ac) - * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; - */ - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock_mutex; - } - } - - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock_mutex; - } - - ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, - &new_phys_cpos, &new_len); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - /* - * allowing partial extent moving is kind of 'pros and cons', it makes - * whole defragmentation less likely to fail, on the contrary, the bad - * thing is it may make the fs even more fragmented after moving, let - * userspace make a good decision here. - */ - if (new_len != *len) { - mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); - if (!partial) { - context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; - ret = -ENOSPC; - goto out_commit; - } - } - - mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, - phys_cpos, new_phys_cpos); - - ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, - new_phys_cpos, ext_flags); - if (ret) - mlog_errno(ret); - - if (partial && (new_len != *len)) - *len = new_len; - - /* - * Here we should write the new page out first if we are - * in write-back mode. - */ - ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); - if (ret) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); - - if (context->data_ac) { - ocfs2_free_alloc_context(context->data_ac); - context->data_ac = NULL; - } - - if (context->meta_ac) { - ocfs2_free_alloc_context(context->meta_ac); - context->meta_ac = NULL; - } - -out: - if (ref_tree) - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - - return ret; -} - -/* - * find the victim alloc group, where #blkno fits. - */ -static int ocfs2_find_victim_alloc_group(struct inode *inode, - u64 vict_blkno, - int type, int slot, - int *vict_bit, - struct buffer_head **ret_bh) -{ - int ret, i, blocks_per_unit = 1; - u64 blkno; - char namebuf[40]; - - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *ac_bh = NULL, *gd_bh = NULL; - struct ocfs2_chain_list *cl; - struct ocfs2_chain_rec *rec; - struct ocfs2_dinode *ac_dinode; - struct ocfs2_group_desc *bg; - - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); - if (ret) { - ret = -ENOENT; - goto out; - } - - ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; - cl = &(ac_dinode->id2.i_chain); - rec = &(cl->cl_recs[0]); - - if (type == GLOBAL_BITMAP_SYSTEM_INODE) - blocks_per_unit <<= (osb->s_clustersize_bits - - inode->i_sb->s_blocksize_bits); - /* - * 'vict_blkno' was out of the valid range. 
- */ - if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || - (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * - blocks_per_unit))) { - ret = -EINVAL; - goto out; - } - - for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { - - rec = &(cl->cl_recs[i]); - if (!rec) - continue; - - bg = NULL; - - do { - if (!bg) - blkno = le64_to_cpu(rec->c_blkno); - else - blkno = le64_to_cpu(bg->bg_next_group); - - if (gd_bh) { - brelse(gd_bh); - gd_bh = NULL; - } - - ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - bg = (struct ocfs2_group_desc *)gd_bh->b_data; - - if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + - le16_to_cpu(bg->bg_bits))) { - - *ret_bh = gd_bh; - *vict_bit = (vict_blkno - blkno) / - blocks_per_unit; - mlog(0, "find the victim group: #%llu, " - "total_bits: %u, vict_bit: %u\n", - blkno, le16_to_cpu(bg->bg_bits), - *vict_bit); - goto out; - } - - } while (le64_to_cpu(bg->bg_next_group)); - } - - ret = -EINVAL; -out: - brelse(ac_bh); - - /* - * caller has to release the gd_bh properly. - */ - return ret; -} - -/* - * XXX: helper to validate and adjust the moving goal. - */ -static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, - struct ocfs2_move_extents *range) -{ - int ret, goal_bit = 0; - - struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *bg; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int c_to_b = 1 << (osb->s_clustersize_bits - - inode->i_sb->s_blocksize_bits); - - /* - * validate goal sits within global_bitmap, and return the victim - * group desc - */ - ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, - &goal_bit, &gd_bh); - if (ret) - goto out; - - bg = (struct ocfs2_group_desc *)gd_bh->b_data; - - /* - * make goal become cluster aligned. - */ - if (range->me_goal % c_to_b) - range->me_goal = range->me_goal / c_to_b * c_to_b; - - /* - * the moving goal is not allowed to start at a group desc block (#0 blk); - * compromise to the next cluster. - */ - if (range->me_goal == le64_to_cpu(bg->bg_blkno)) - range->me_goal += c_to_b; - - /* - * the movement may not cross two groups. - */ - if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < - range->me_len) { - ret = -EINVAL; - goto out; - } - /* - * more exact validations/adjustments will be performed later during - * the moving operation for each extent range. - */ - mlog(0, "extents get ready to be moved to #%llu block\n", - range->me_goal); - -out: - brelse(gd_bh); - - return ret; -}
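The goal adjustment above boils down to two integer steps: round the caller's block goal down to a cluster boundary, then step past the group-descriptor block if the goal lands on it. A standalone sketch of that arithmetic, assuming a hypothetical 8-blocks-per-cluster geometry (plain C, not ocfs2 code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t c_to_b = 8;       /* blocks per cluster; 1 << (cluster bits - block bits) */
	uint64_t bg_blkno = 4096;  /* pretend group-descriptor block */
	uint64_t goal = 4101;      /* caller-supplied physical block goal */

	if (goal % c_to_b)                 /* make goal become cluster aligned */
		goal = goal / c_to_b * c_to_b;

	if (goal == bg_blkno)              /* never start on the descriptor block */
		goal += c_to_b;

	printf("aligned goal: %llu\n", (unsigned long long)goal);
	return 0;
}

Here 4101 rounds down to 4096, which collides with the descriptor block and is bumped to 4104, mirroring the two checks in the deleted function.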
- -static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, - int *goal_bit, u32 move_len, u32 max_hop, - u32 *phys_cpos) -{ - int i, used, last_free_bits = 0, base_bit = *goal_bit; - struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; - u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(gd->bg_blkno)); - - for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { - - used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); - if (used) { - /* - * we even tried searching the free chunk by jumping - * a 'max_hop' distance, but still failed. - */ - if ((i - base_bit) > max_hop) { - *phys_cpos = 0; - break; - } - - if (last_free_bits) - last_free_bits = 0; - - continue; - } else - last_free_bits++; - - if (last_free_bits == move_len) { - *goal_bit = i; - *phys_cpos = base_cpos + i; - break; - } - } - - mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); -} - -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -}
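ocfs2_alloc_dinode_update_counts() keeps two on-disk counters in lock-step: the allocator inode's i_used grows by exactly the num_bits that the chain record's c_free loses. A plain-C model of that pairing; the kernel goes through le32_to_cpu()/le32_add_cpu() because the fields are little-endian on disk, which this sketch deliberately omits:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct chain_rec { uint32_t c_free; };

struct alloc_dinode {
	uint32_t i_total;        /* bits in the group */
	uint32_t i_used;         /* grows by num_bits */
	struct chain_rec rec;    /* c_free shrinks by num_bits */
};

static void update_counts(struct alloc_dinode *di, uint32_t num_bits)
{
	di->i_used += num_bits;
	di->rec.c_free -= num_bits;
	assert(di->i_used <= di->i_total);               /* can't overflow the group */
	assert(di->i_used + di->rec.c_free == di->i_total); /* counters stay paired */
}

int main(void)
{
	struct alloc_dinode di = { 256, 100, { 156 } };
	update_counts(&di, 16);
	printf("used %u, free %u\n", di.i_used, di.rec.c_free);
	return 0;
}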
num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - -static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, - u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, - u32 len, int ext_flags) -{ - int ret, credits = 0, extra_blocks = 0, goal_bit = 0; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - struct inode *gb_inode = NULL; - struct buffer_head *gb_bh = NULL; - struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *gd; - struct ocfs2_refcount_tree *ref_tree = NULL; - u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, - context->range->me_threshold); - u64 phys_blkno, new_phys_blkno; - - phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - - if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - BUG_ON(!context->refcount_loc); - - ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - return ret; - } - - ret = ocfs2_prepare_refcount_change_for_del(inode, - context->refcount_loc, - phys_blkno, - len, - &credits, - &extra_blocks); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); - if (ret) { - mlog_errno(ret); - goto out; - } - - /* - * need to count 2 extra credits for global_bitmap inode and - * group descriptor. - */ - credits += OCFS2_INODE_UPDATE_CREDITS + 1; - - /* - * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() - * logic, while we still need to lock the global_bitmap. - */ - gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); - if (!gb_inode) { - mlog(ML_ERROR, "unable to get global_bitmap inode\n"); - ret = -EIO; - goto out; - } - - mutex_lock(&gb_inode->i_mutex); - - ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); - if (ret) { - mlog_errno(ret); - goto out_unlock_gb_mutex; - } - - mutex_lock(&tl_inode->i_mutex); - - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock_tl_inode; - } - - new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); - ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, - GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT, - &goal_bit, &gd_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - /* - * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom - * a best-effort attempt by compromising to a threshold - * around the goal. 
- -/* - * Helper to calculate the defraging length in one run according to threshold. - */ -static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, - u32 threshold, int *skip) -{ - if ((*alloc_size + *len_defraged) < threshold) { - /* - * proceed defragmentation until we meet the thresh - */ - *len_defraged += *alloc_size; - } else if (*len_defraged == 0) { - /* - * XXX: skip a large extent. - */ - *skip = 1; - } else { - /* - * split this extent to coalesce with former pieces so as - * to reach the threshold. - * - * we're done here with one cycle of defragmentation - * in a size of 'thresh'; resetting 'len_defraged' - * forces a new defragmentation. - */ - *alloc_size = threshold - *len_defraged; - *len_defraged = 0; - } -}
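A worked example of the thresholding above, assuming a 16-cluster threshold: extents of 6, 6 and 10 clusters arrive; the first two accumulate whole, and the third is split so the cycle closes at exactly 16 clusters before a fresh cycle begins:

#include <stdio.h>

static void calc_defrag_len(unsigned *alloc_size, unsigned *len_defraged,
			    unsigned threshold, int *skip)
{
	if (*alloc_size + *len_defraged < threshold) {
		*len_defraged += *alloc_size;            /* keep accumulating */
	} else if (*len_defraged == 0) {
		*skip = 1;                               /* lone large extent: skip */
	} else {
		*alloc_size = threshold - *len_defraged; /* split to fit the cycle */
		*len_defraged = 0;                       /* forces a new cycle */
	}
}

int main(void)
{
	unsigned extents[] = { 6, 6, 10 }, len_defraged = 0;
	for (int i = 0; i < 3; i++) {
		unsigned sz = extents[i];
		int skip = 0;
		calc_defrag_len(&sz, &len_defraged, 16, &skip);
		printf("extent %u -> move %u clusters%s\n",
		       extents[i], skip ? 0 : sz, skip ? " (skipped)" : "");
	}
	return 0;
}

In the real code the split remainder is revisited on the caller's next loop pass, since cpos only advances by the clamped alloc_size.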
- -static int __ocfs2_move_extents_range(struct buffer_head *di_bh, - struct ocfs2_move_extents_context *context) -{ - int ret = 0, flags, do_defrag, skip = 0; - u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; - u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; - - struct inode *inode = context->inode; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_move_extents *range = context->range; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if ((inode->i_size == 0) || (range->me_len == 0)) - return 0; - - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return 0; - - context->refcount_loc = le64_to_cpu(di->i_refcount_loc); - - ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); - ocfs2_init_dealloc_ctxt(&context->dealloc); - - /* - * TO-DO XXX: - * - * - xattr extents. - */ - - do_defrag = context->auto_defrag; - - /* - * extent moving happens in units of clusters; for the sake - * of simplicity, we may ignore the two clusters that 'byte_start' - * and 'byte_start + len' fall within. - */ - move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); - len_to_move = (range->me_start + range->me_len) >> - osb->s_clustersize_bits; - if (len_to_move >= move_start) - len_to_move -= move_start; - else - len_to_move = 0; - - if (do_defrag) { - defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; - if (defrag_thresh <= 1) - goto done; - } else - new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - range->me_goal); - - mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " - "thresh: %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)range->me_start, - (unsigned long long)range->me_len, - move_start, len_to_move, defrag_thresh); - - cpos = move_start; - while (len_to_move) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &flags); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (alloc_size > len_to_move) - alloc_size = len_to_move; - - /* - * XXX: how to deal with a hole: - * - * - skip the hole of course - * - force a new defragmentation - */ - if (!phys_cpos) { - if (do_defrag) - len_defraged = 0; - - goto next; - } - - if (do_defrag) { - ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, - defrag_thresh, &skip); - /* - * skip large extents - */ - if (skip) { - skip = 0; - goto next; - } - - mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " - "alloc_size: %u, len_defraged: %u\n", - cpos, phys_cpos, alloc_size, len_defraged); - - ret = ocfs2_defrag_extent(context, cpos, phys_cpos, - &alloc_size, flags); - } else { - ret = ocfs2_move_extent(context, cpos, phys_cpos, - &new_phys_cpos, alloc_size, - flags); - - new_phys_cpos += alloc_size; - } - - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - context->clusters_moved += alloc_size; -next: - cpos += alloc_size; - len_to_move -= alloc_size; - } - -done: - range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; - -out: - range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, - context->clusters_moved); - range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, - context->new_phys_cpos); - - ocfs2_schedule_truncate_log_flush(osb, 1); - ocfs2_run_deallocs(osb, &context->dealloc); - - return ret; -} - -static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) -{ - int status; - handle_t *handle; - struct inode *inode = context->inode; - struct ocfs2_dinode *di; - struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if (!inode) - return -ENOENT; - - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) - return -EROFS; - - mutex_lock(&inode->i_mutex); - - /* - * This prevents concurrent writes from other nodes - */ - status = ocfs2_rw_lock(inode, 1); - if (status) { - mlog_errno(status); - goto out; - } - - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status) { - mlog_errno(status); - goto out_rw_unlock; - } - - /* - * remember ip_xattr_sem also needs to be held if necessary - */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - - status = __ocfs2_move_extents_range(di_bh, context); - - up_write(&OCFS2_I(inode)->ip_alloc_sem); - if (status) { - mlog_errno(status); - goto out_inode_unlock; - } - - /* - * We update ctime for these changes - */ - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_inode_unlock; - } - - status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status) { - mlog_errno(status); - goto out_commit; - } - - di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - - ocfs2_journal_dirty(handle, di_bh); - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_inode_unlock: - brelse(di_bh); - ocfs2_inode_unlock(inode, 1); -out_rw_unlock: - ocfs2_rw_unlock(inode, 1); -out: - mutex_unlock(&inode->i_mutex); - - return status; -}
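ocfs2_move_extents() above takes i_mutex, the rw lock and the cluster inode lock in that order, then unwinds through its goto ladder in strict reverse order. A compilable skeleton of that error-unwind shape, with stub lock functions standing in for the ocfs2 primitives:

#include <stdio.h>

static int lock_ok(const char *name) { printf("lock %s\n", name); return 0; }
static void unlock(const char *name) { printf("unlock %s\n", name); }
static int work(void) { puts("move extents"); return 0; }

static int do_move(void)
{
	int status;

	lock_ok("i_mutex");            /* mutex_lock() can't fail */
	status = lock_ok("rw");        /* ocfs2_rw_lock(inode, 1) */
	if (status)
		goto out;
	status = lock_ok("inode");     /* ocfs2_inode_lock(inode, &di_bh, 1) */
	if (status)
		goto out_rw_unlock;

	status = work();               /* __ocfs2_move_extents_range() */

	unlock("inode");
out_rw_unlock:
	unlock("rw");
out:
	unlock("i_mutex");
	return status;
}

int main(void) { return do_move(); }

Each failure point jumps to the label that releases only what has already been acquired, so no lock is ever dropped twice or leaked.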
- -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) -{ - int status; - - struct inode *inode = filp->f_path.dentry->d_inode; - struct ocfs2_move_extents range; - struct ocfs2_move_extents_context *context = NULL; - - status = mnt_want_write(filp->f_path.mnt); - if (status) - return status; - - if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) - goto out; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { - status = -EPERM; - goto out; - } - - context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); - if (!context) { - status = -ENOMEM; - mlog_errno(status); - goto out; - } - - context->inode = inode; - context->file = filp; - - if (argp) { - if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, - sizeof(range))) { - status = -EFAULT; - goto out; - } - } else { - status = -EINVAL; - goto out; - } - - if (range.me_start > i_size_read(inode)) - goto out; - - if (range.me_start + range.me_len > i_size_read(inode)) - range.me_len = i_size_read(inode) - range.me_start; - - context->range = &range; - - if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { - context->auto_defrag = 1; - /* - * ok, the default threshold for the defragmentation - * is 1M, since our maximum clustersize was 1M also. - * any thought? - */ - if (!range.me_threshold) - range.me_threshold = 1024 * 1024; - - if (range.me_threshold > i_size_read(inode)) - range.me_threshold = i_size_read(inode); - - if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) - context->partial = 1; - } else { - /* - * first best-effort attempt to validate and adjust the goal - * (physical address in block), while it can't guarantee later - * operation can succeed all the time since global_bitmap may - * change a bit over time. - */ - - status = ocfs2_validate_and_adjust_move_goal(inode, &range); - if (status) - goto out; - } - - status = ocfs2_move_extents(context); - if (status) - mlog_errno(status); -out: - /* - * movement/defragmentation may end up being partially completed, - * that's the reason why we need to return to userspace the finished - * length and new_offset even if a failure happens somewhere. - */ - if (argp) { - if (copy_to_user((struct ocfs2_move_extents *)argp, &range, - sizeof(range))) - status = -EFAULT; - } - - kfree(context); - - mnt_drop_write(filp->f_path.mnt); - - return status; -} diff --git a/trunk/fs/ocfs2/move_extents.h b/trunk/fs/ocfs2/move_extents.h deleted file mode 100644 index 4e143e811441..000000000000 --- a/trunk/fs/ocfs2/move_extents.h +++ /dev/null @@ -1,22 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * move_extents.h - * - * Copyright (C) 2011 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef OCFS2_MOVE_EXTENTS_H -#define OCFS2_MOVE_EXTENTS_H - -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); - -#endif /* OCFS2_MOVE_EXTENTS_H */
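Userspace drove the interface declared above through OCFS2_IOC_MOVE_EXT, using the struct ocfs2_move_extents that the ocfs2_ioctl.h hunk below removes. A hedged usage sketch against a pre-revert kernel; the include path is an assumption:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <ocfs2/ocfs2_ioctl.h>   /* hypothetical install path for the header */

int main(int argc, char **argv)
{
	struct ocfs2_move_extents range;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;

	memset(&range, 0, sizeof(range));
	range.me_start = 0;                  /* all values in bytes */
	range.me_len = 16 * 1024 * 1024;
	range.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG;
	/* me_threshold left 0: the kernel falls back to its 1MB default */

	if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &range) < 0)
		perror("OCFS2_IOC_MOVE_EXT");

	if (!(range.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE))
		fprintf(stderr, "only %llu bytes defragged\n",
			(unsigned long long)range.me_moved_len);

	close(fd);
	return 0;
}

Note that me_moved_len is filled in even on failure, matching the copy_to_user() in the error path of ocfs2_ioctl_move_extents() above.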
See the GNU - * General Public License for more details. - */ -#ifndef OCFS2_MOVE_EXTENTS_H -#define OCFS2_MOVE_EXTENTS_H - -int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); - -#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/trunk/fs/ocfs2/ocfs2_ioctl.h b/trunk/fs/ocfs2/ocfs2_ioctl.h index 5b27ff1fa577..b46f39bf7438 100644 --- a/trunk/fs/ocfs2/ocfs2_ioctl.h +++ b/trunk/fs/ocfs2/ocfs2_ioctl.h @@ -142,38 +142,6 @@ struct ocfs2_info_journal_size { __u64 ij_journal_size; }; -struct ocfs2_info_freeinode { - struct ocfs2_info_request ifi_req; - struct ocfs2_info_local_freeinode { - __u64 lfi_total; - __u64 lfi_free; - } ifi_stat[OCFS2_MAX_SLOTS]; - __u32 ifi_slotnum; /* out */ - __u32 ifi_pad; -}; - -#define OCFS2_INFO_MAX_HIST (32) - -struct ocfs2_info_freefrag { - struct ocfs2_info_request iff_req; - struct ocfs2_info_freefrag_stats { /* (out) */ - struct ocfs2_info_free_chunk_list { - __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; - __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; - } ffs_fc_hist; - __u32 ffs_clusters; - __u32 ffs_free_clusters; - __u32 ffs_free_chunks; - __u32 ffs_free_chunks_real; - __u32 ffs_min; /* Minimum free chunksize in clusters */ - __u32 ffs_max; - __u32 ffs_avg; - __u32 ffs_pad; - } iff_ffs; - __u32 iff_chunksize; /* chunksize in clusters(in) */ - __u32 iff_pad; -}; - /* Codes for ocfs2_info_request */ enum ocfs2_info_type { OCFS2_INFO_CLUSTERSIZE = 1, @@ -183,8 +151,6 @@ enum ocfs2_info_type { OCFS2_INFO_UUID, OCFS2_INFO_FS_FEATURES, OCFS2_INFO_JOURNAL_SIZE, - OCFS2_INFO_FREEINODE, - OCFS2_INFO_FREEFRAG, OCFS2_INFO_NUM_TYPES }; @@ -205,38 +171,4 @@ enum ocfs2_info_type { #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) -struct ocfs2_move_extents { -/* All values are in bytes */ - /* in */ - __u64 me_start; /* Virtual start in the file to move */ - __u64 me_len; /* Length of the extents to be moved */ - __u64 me_goal; /* Physical offset of the goal, - it's in block unit */ - __u64 me_threshold; /* Maximum distance from goal or threshold - for auto defragmentation */ - __u64 me_flags; /* Flags for the operation: - * - auto defragmentation. - * - refcount,xattr cases. - */ - /* out */ - __u64 me_moved_len; /* Moved/defraged length */ - __u64 me_new_offset; /* Resulting physical location */ - __u32 me_reserved[2]; /* Reserved for future */ -}; - -#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to - claim new clusters - as the goal place - for extents moving */ -#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent - moving, is to make - movement less likely - to fail, may make fs - even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation - completely gets done.
- */ - -#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) - #endif /* OCFS2_IOCTL_H */ diff --git a/trunk/fs/ocfs2/ocfs2_trace.h b/trunk/fs/ocfs2/ocfs2_trace.h index 3b481f490633..a1dae5bb54ac 100644 --- a/trunk/fs/ocfs2/ocfs2_trace.h +++ b/trunk/fs/ocfs2/ocfs2_trace.h @@ -688,31 +688,6 @@ TRACE_EVENT(ocfs2_cache_block_dealloc, __entry->blkno, __entry->bit) ); -TRACE_EVENT(ocfs2_trim_extent, - TP_PROTO(struct super_block *sb, unsigned long long blk, - unsigned long long count), - TP_ARGS(sb, blk, count), - TP_STRUCT__entry( - __field(int, dev_major) - __field(int, dev_minor) - __field(unsigned long long, blk) - __field(__u64, count) - ), - TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); - __entry->blk = blk; - __entry->count = count; - ), - TP_printk("%d %d %llu %llu", - __entry->dev_major, __entry->dev_minor, - __entry->blk, __entry->count) -); - -DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); - -DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); - /* End of trace events for fs/ocfs2/alloc.c. */ /* Trace events for fs/ocfs2/localalloc.c. */ diff --git a/trunk/fs/ocfs2/refcounttree.c b/trunk/fs/ocfs2/refcounttree.c index ebfd3825f12a..3c7606cff1ab 100644 --- a/trunk/fs/ocfs2/refcounttree.c +++ b/trunk/fs/ocfs2/refcounttree.c @@ -66,7 +66,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct ocfs2_cow_context *context, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2921,21 +2921,20 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) return 0; } -int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file->f_path.dentry->d_inode; - struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct ocfs2_caching_info *ci = context->data_et.et_ci; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; unsigned int from, to, readahead_pages; loff_t offset, end, map_end; - struct address_space *mapping = inode->i_mapping; + struct address_space *mapping = context->inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); @@ -2949,8 +2948,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, * We only duplicate pages until we reach the page contains i_size - 1. * So trim 'end' to i_size. 
*/ - if (end > i_size_read(inode)) - end = i_size_read(inode); + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -2973,9 +2972,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page)) { + if (PageReadahead(page) && context->file) { page_cache_async_readahead(mapping, - &file->f_ra, file, + &context->file->f_ra, + context->file, page, page_index, readahead_pages); } @@ -2999,7 +2999,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, + ocfs2_map_and_dirty_page(context->inode, + handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3014,15 +3015,14 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, return ret; } -int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct super_block *sb = context->inode->i_sb; + struct ocfs2_caching_info *ci = context->data_et.et_ci; int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); @@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle, /* If the old clusters are unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, - cpos, old, new, len); + ret = context->cow_duplicate_clusters(handle, context, cpos, + old, new, len); if (ret) { mlog_errno(ret); goto out; @@ -3162,22 +3162,22 @@ static int ocfs2_replace_clusters(handle_t *handle, return ret; } -int ocfs2_cow_sync_writeback(struct super_block *sb, - struct inode *inode, - u32 cpos, u32 num_clusters) +static int ocfs2_cow_sync_writeback(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 num_clusters) { int ret = 0; loff_t offset, end, map_end; pgoff_t page_index; struct page *page; - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(context->inode)) return 0; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); - ret = filemap_fdatawrite_range(inode->i_mapping, + ret = filemap_fdatawrite_range(context->inode->i_mapping, offset, end - 1); if (ret < 0) { mlog_errno(ret); @@ -3190,7 +3190,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = find_or_create_page(inode->i_mapping, + page = find_or_create_page(context->inode->i_mapping, page_index, GFP_NOFS); BUG_ON(!page); @@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode.
*/ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, + ret = ocfs2_cow_sync_writeback(sb, context, cpos, orig_num_clusters); if (ret) mlog_errno(ret); diff --git a/trunk/fs/ocfs2/refcounttree.h b/trunk/fs/ocfs2/refcounttree.h index 7754608c83a4..c8ce46f7d8e3 100644 --- a/trunk/fs/ocfs2/refcounttree.h +++ b/trunk/fs/ocfs2/refcounttree.h @@ -84,17 +84,6 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, struct buffer_head *ref_root_bh, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); -int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len); -int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len); -int ocfs2_cow_sync_writeback(struct super_block *sb, - struct inode *inode, - u32 cpos, u32 num_clusters); int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_extent_tree *data_et, struct ocfs2_caching_info *ref_ci, diff --git a/trunk/fs/ocfs2/super.c b/trunk/fs/ocfs2/super.c index cdbaf5e97308..5a521c748859 100644 --- a/trunk/fs/ocfs2/super.c +++ b/trunk/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" @@ -1567,7 +1566,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); - if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) + if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) @@ -2353,7 +2352,6 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&uuid_net_key, sb); bail: return status; diff --git a/trunk/fs/omfs/dir.c b/trunk/fs/omfs/dir.c index c368360c35a1..de4ff29f1e05 100644 --- a/trunk/fs/omfs/dir.c +++ b/trunk/fs/omfs/dir.c @@ -240,12 +240,8 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - - if (S_ISDIR(inode->i_mode)) { - dentry_unhash(dentry); - if (!omfs_dir_is_empty(inode)) - return -ENOTEMPTY; - } + if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; ret = omfs_delete_entry(dentry); if (ret) @@ -382,9 +378,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, int err; if (new_inode) { - if (S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* overwriting existing file/dir */ err = omfs_remove(new_dir, new_dentry); if (err) diff --git a/trunk/fs/proc/Makefile b/trunk/fs/proc/Makefile index c1c729335924..df434c5f28fb 100644 --- a/trunk/fs/proc/Makefile +++ b/trunk/fs/proc/Makefile @@ -20,7 +20,6 @@ proc-y += stat.o proc-y += uptime.o proc-y += version.o proc-y += softirqs.o -proc-y += namespaces.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index dc8bca72b002..dfa532730e55 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct dentry *dentry, struct iattr *attr) +static int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ 
-1736,7 +1736,8 @@ static int task_dumpable(struct task_struct *task) return 0; } -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) + +static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1778,7 +1779,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t return NULL; } -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; @@ -1819,7 +1820,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * made this apply to all per process world readable and executable * directories. */ -int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode; struct task_struct *task; @@ -1861,7 +1862,7 @@ static int pid_delete_dentry(const struct dentry * dentry) return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } -const struct dentry_operations pid_dentry_operations = +static const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1869,6 +1870,9 @@ const struct dentry_operations pid_dentry_operations = /* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); + /* * Fill a directory entry. * @@ -1881,8 +1885,8 @@ const struct dentry_operations pid_dentry_operations = * reported by readdir in sync with the inode numbers reported * by stat. */ -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, +static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = filp->f_path.dentry; @@ -2816,7 +2820,6 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), - DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -3165,7 +3168,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), - DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), diff --git a/trunk/fs/proc/inode.c b/trunk/fs/proc/inode.c index 74b48cfa1bb2..d15aa1b1cc8f 100644 --- a/trunk/fs/proc/inode.c +++ b/trunk/fs/proc/inode.c @@ -28,7 +28,6 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; - const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -45,10 +44,6 @@ static void proc_evict_inode(struct inode 
*inode) rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } - /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); } static struct kmem_cache * proc_inode_cachep; @@ -67,8 +62,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns = NULL; - ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 7838e5cfec14..3763b436e69d 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -127,21 +127,3 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); */ int proc_readdir(struct file *, void *, filldir_t); struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); - - - -/* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, - instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, struct nameidata *nd); -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); -extern const struct dentry_operations pid_dentry_operations; -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int proc_setattr(struct dentry *dentry, struct iattr *attr); - -extern const struct inode_operations proc_ns_dir_inode_operations; -extern const struct file_operations proc_ns_dir_operations; - diff --git a/trunk/fs/proc/namespaces.c b/trunk/fs/proc/namespaces.c deleted file mode 100644 index 781dec5bd682..000000000000 --- a/trunk/fs/proc/namespaces.c +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "internal.h" - - -static const struct proc_ns_operations *ns_entries[] = { -#ifdef CONFIG_NET_NS - &netns_operations, -#endif -#ifdef CONFIG_UTS_NS - &utsns_operations, -#endif -#ifdef CONFIG_IPC_NS - &ipcns_operations, -#endif -}; - -static const struct file_operations ns_file_operations = { - .llseek = no_llseek, -}; - -static struct dentry *proc_ns_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct proc_ns_operations *ns_ops = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; - - dentry->d_op = &pid_dentry_operations; - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static int proc_ns_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, - const struct proc_ns_operations *ops) -{ - return proc_fill_cache(filp, dirent, filldir, - ops->name, strlen(ops->name), - proc_ns_instantiate, task, ops); -} - -static int proc_ns_dir_readdir(struct file *filp, void *dirent, - filldir_t 
filldir) -{ - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - const struct proc_ns_operations **entry, **last; - ino_t ino; - int ret; - - ret = -ENOENT; - if (!task) - goto out_no_task; - - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= ARRAY_SIZE(ns_entries)) { - ret = 1; - goto out; - } - entry = ns_entries + i; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - while (entry <= last) { - if (proc_ns_fill_cache(filp, dirent, filldir, - task, *entry) < 0) - goto out; - filp->f_pos++; - entry++; - } - } - - ret = 1; -out: - put_task_struct(task); -out_no_task: - return ret; -} - -const struct file_operations proc_ns_dir_operations = { - .read = generic_read_dir, - .readdir = proc_ns_dir_readdir, -}; - -static struct dentry *proc_ns_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; - unsigned int len = dentry->d_name.len; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { - if (strlen((*entry)->name) != len) - continue; - if (!memcmp(dentry->d_name.name, (*entry)->name, len)) - break; - } - error = ERR_PTR(-ENOENT); - if (entry > last) - goto out; - - error = proc_ns_instantiate(dir, dentry, task, *entry); -out: - put_task_struct(task); -out_no_task: - return error; -} - -const struct inode_operations proc_ns_dir_inode_operations = { - .lookup = proc_ns_dir_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index db15935fa757..2c9db29ea358 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - vm_flags_t flags = vma->vm_flags; + int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; diff --git a/trunk/fs/reiserfs/namei.c b/trunk/fs/reiserfs/namei.c index 76c8164d5651..118662690cdf 100644 --- a/trunk/fs/reiserfs/namei.c +++ b/trunk/fs/reiserfs/namei.c @@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - dentry_unhash(dentry); - /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. 
* The quota structure is possibly deleted only on last iput => outside @@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* three balancings: (1) old name removal, (2) new name insertion and (3) maybe "save" link insertion stat data updates: (1) old directory, diff --git a/trunk/fs/reiserfs/xattr.c b/trunk/fs/reiserfs/xattr.c index 50f1abccd1cd..47d2a4498b03 100644 --- a/trunk/fs/reiserfs/xattr.c +++ b/trunk/fs/reiserfs/xattr.c @@ -105,6 +105,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); if (!error) d_delete(dentry); + dput(dentry); return error; } diff --git a/trunk/fs/super.c b/trunk/fs/super.c index c75593953c52..c04f7e0b7ed2 100644 --- a/trunk/fs/super.c +++ b/trunk/fs/super.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "internal.h" @@ -113,7 +112,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; } out: return s; @@ -179,7 +177,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - cleancache_flush_fs(s); fs->kill_sb(s); /* * We need to call rcu_barrier so all the delayed rcu free diff --git a/trunk/fs/sysv/namei.c b/trunk/fs/sysv/namei.c index e2cc6756f3b1..e474fbcf8bde 100644 --- a/trunk/fs/sysv/namei.c +++ b/trunk/fs/sysv/namei.c @@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) struct inode *inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (sysv_empty_dir(inode)) { err = sysv_unlink(dir, dentry); if (!err) { @@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct sysv_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/ubifs/dir.c b/trunk/fs/ubifs/dir.c index c2b80943560d..ef5abd38f0bf 100644 --- a/trunk/fs/ubifs/dir.c +++ b/trunk/fs/ubifs/dir.c @@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; - dentry_unhash(dentry); - /* * Budget request settings: deletion direntry, deletion inode and * changing the parent inode. If budgeting fails, go ahead anyway @@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. 
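Note: the reiserfs, sysv, and ubifs hunks above and the udf and ufs hunks below all delete the same boilerplate from rmdir and rename. A minimal sketch of the pattern being removed follows; example_rmdir, example_empty_dir, and example_delete_entry are hypothetical stand-ins, not functions from this patch:

	/*
	 * Sketch only, assuming the common shape visible in the hunks: each
	 * filesystem unhashed the victim dentry up front, then performed the
	 * usual emptiness check and entry removal. The dentry_unhash() call
	 * is the line this series deletes; everything else stays.
	 */
	static int example_rmdir(struct inode *dir, struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;

		dentry_unhash(dentry);			/* the call being removed */

		if (!example_empty_dir(inode))		/* hypothetical helper */
			return -ENOTEMPTY;

		return example_delete_entry(dir, dentry); /* hypothetical helper */
	}
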
diff --git a/trunk/fs/udf/namei.c b/trunk/fs/udf/namei.c index 4d76594c2a8f..f1dce848ef96 100644 --- a/trunk/fs/udf/namei.c +++ b/trunk/fs/udf/namei.c @@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - dentry_unhash(dentry); - retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) @@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) diff --git a/trunk/fs/ufs/namei.c b/trunk/fs/ufs/namei.c index 953ebdfc5bf7..29309e25417f 100644 --- a/trunk/fs/ufs/namei.c +++ b/trunk/fs/ufs/namei.c @@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err= -ENOTEMPTY; - dentry_unhash(dentry); - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); @@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/trunk/fs/xfs/linux-2.6/xfs_discard.c b/trunk/fs/xfs/linux-2.6/xfs_discard.c index 244e797dae32..d61611c88012 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_discard.c +++ b/trunk/fs/xfs/linux-2.6/xfs_discard.c @@ -191,32 +191,3 @@ xfs_ioc_trim( return -XFS_ERROR(EFAULT); return 0; } - -int -xfs_discard_extents( - struct xfs_mount *mp, - struct list_head *list) -{ - struct xfs_busy_extent *busyp; - int error = 0; - - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0); - if (error && error != EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llu,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - return error; - } - } - - return 0; -} diff --git a/trunk/fs/xfs/linux-2.6/xfs_discard.h b/trunk/fs/xfs/linux-2.6/xfs_discard.h index 344879aea646..e82b6dd3e127 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_discard.h +++ b/trunk/fs/xfs/linux-2.6/xfs_discard.h @@ -2,9 +2,7 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); -extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); #endif /* XFS_DISCARD_H */ diff --git a/trunk/fs/xfs/linux-2.6/xfs_super.c b/trunk/fs/xfs/linux-2.6/xfs_super.c index 98b9c91fcdf1..b0aa59e51fd0 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_super.c +++ b/trunk/fs/xfs/linux-2.6/xfs_super.c @@ -110,10 +110,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ -#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ -#define MNTOPT_NODISCARD 
"nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -357,10 +355,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; - } else if (!strcmp(this_char, MNTOPT_DISCARD)) { - mp->m_flags |= XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { - mp->m_flags &= ~XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, "ihashsize")) { xfs_warn(mp, "ihashsize no longer used, option is deprecated."); @@ -394,13 +388,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +488,6 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, - { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { diff --git a/trunk/fs/xfs/xfs_ag.h b/trunk/fs/xfs/xfs_ag.h index 6530769a999b..da0a561ffba2 100644 --- a/trunk/fs/xfs/xfs_ag.h +++ b/trunk/fs/xfs/xfs_ag.h @@ -187,9 +187,6 @@ struct xfs_busy_extent { xfs_agnumber_t agno; xfs_agblock_t bno; xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; /* diff --git a/trunk/fs/xfs/xfs_alloc.c b/trunk/fs/xfs/xfs_alloc.c index 95862bbff56b..acdced86413c 100644 --- a/trunk/fs/xfs/xfs_alloc.c +++ b/trunk/fs/xfs/xfs_alloc.c @@ -2469,7 +2469,7 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); error0: xfs_perag_put(args.pag); return error; @@ -2480,8 +2480,7 @@ xfs_alloc_busy_insert( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) + xfs_extlen_t len) { struct xfs_busy_extent *new; struct xfs_busy_extent *busyp; @@ -2505,7 +2504,6 @@ xfs_alloc_busy_insert( new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); - new->flags = flags; /* trace before insert to be able to see failed inserts */ trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); @@ -2610,18 +2608,6 @@ xfs_alloc_busy_update_extent( xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. - */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - /* * If there is a busy extent overlapping a user allocation, we have * no choice but to force the log and retry the search. @@ -2827,8 +2813,7 @@ xfs_alloc_busy_trim( * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. 
*/ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { + if (!args->userdata) { if (!xfs_alloc_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, false)) @@ -2994,16 +2979,10 @@ xfs_alloc_busy_clear_one( kmem_free(busyp); } -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. - */ void xfs_alloc_busy_clear( struct xfs_mount *mp, - struct list_head *list, - bool do_discard) + struct list_head *list) { struct xfs_busy_extent *busyp, *n; struct xfs_perag *pag = NULL; @@ -3020,11 +2999,7 @@ xfs_alloc_busy_clear( agno = busyp->agno; } - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); + xfs_alloc_busy_clear_one(mp, pag, busyp); } if (pag) { diff --git a/trunk/fs/xfs/xfs_alloc.h b/trunk/fs/xfs/xfs_alloc.h index 2f52b924be79..240ad288f2f9 100644 --- a/trunk/fs/xfs/xfs_alloc.h +++ b/trunk/fs/xfs/xfs_alloc.h @@ -137,11 +137,10 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list); int xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/trunk/fs/xfs/xfs_alloc_btree.c b/trunk/fs/xfs/xfs_alloc_btree.c index 2b3518826a69..8b469d53599f 100644 --- a/trunk/fs/xfs/xfs_alloc_btree.c +++ b/trunk/fs/xfs/xfs_alloc_btree.c @@ -120,8 +120,7 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/trunk/fs/xfs/xfs_bmap.c b/trunk/fs/xfs/xfs_bmap.c index e546a33214c9..fa00788de2f5 100644 --- a/trunk/fs/xfs/xfs_bmap.c +++ b/trunk/fs/xfs/xfs_bmap.c @@ -88,6 +88,22 @@ xfs_bmap_add_attrfork_local( xfs_bmap_free_t *flist, /* blocks to free at commit */ int *flags); /* inode logging flags */ +/* + * Called by xfs_bmapi to update file extent records and the btree + * after allocating space (or doing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + /* * Called by xfs_bmap_add_extent to handle cases converting a delayed * allocation to a real allocation. 
@@ -95,13 +111,14 @@ xfs_bmap_add_attrfork_local( STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -110,9 +127,10 @@ xfs_bmap_add_extent_delay_real( STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ + int *logflagsp,/* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -121,7 +139,7 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ @@ -134,7 +152,7 @@ xfs_bmap_add_extent_hole_real( STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp); /* inode logging flags */ @@ -161,6 +179,22 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork); /* data or attr fork */ +/* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current trans pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp,/* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + /* * Remove the entry "free" from the free item list. Prev points to the * previous entry, unless "free" is the head of the list. 
@@ -440,13 +474,14 @@ xfs_bmap_add_attrfork_local( STATIC int /* error */ xfs_bmap_add_extent( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to use reserved data blocks */ { xfs_btree_cur_t *cur; /* btree cursor or null */ xfs_filblks_t da_new; /* new count del alloc blocks used */ @@ -457,27 +492,23 @@ xfs_bmap_add_extent( xfs_extnum_t nextents; /* number of extents in file now */ XFS_STATS_INC(xs_add_exlist); - cur = *curp; ifp = XFS_IFORK_PTR(ip, whichfork); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx <= nextents); da_old = da_new = 0; error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - /* * This is the first extent added to a new/empty file. * Special case this one, so other routines get to assume there are * already extents in the list. */ if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, + xfs_iext_insert(ip, 0, 1, new, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); ASSERT(cur == NULL); - + ifp->if_lastex = 0; if (!isnullstartblock(new->br_startblock)) { XFS_IFORK_NEXT_SET(ip, whichfork, 1); logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -491,25 +522,27 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); + if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, + &logflags, rsvd))) + goto done; } /* * Real allocation off the end of the file. */ - else if (*idx == nextents) { + else if (idx == nextents) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork))) + goto done; } else { xfs_bmbt_irec_t prev; /* old extent at offset idx */ /* * Get the record referred to by idx. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); /* * If it's a real allocation record, and the new allocation ends * after the start of the referred to record, then we're filling @@ -524,18 +557,22 @@ xfs_bmap_add_extent( if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(ip, - idx, &cur, new, &da_new, - first, flist, &logflags); + if ((error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, first, flist, + &logflags, rsvd))) + goto done; + } else if (new->br_state == XFS_EXT_NORM) { + ASSERT(new->br_state == XFS_EXT_NORM); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) + ASSERT(new->br_state == XFS_EXT_UNWRITTEN); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) goto done; } + ASSERT(*curp == cur || *curp == NULL); } /* * Otherwise we're filling in a hole with an allocation. 
@@ -544,15 +581,13 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork))) + goto done; } } - if (error) - goto done; ASSERT(*curp == cur || *curp == NULL); - /* * Convert to a btree if necessary. */ @@ -580,7 +615,7 @@ xfs_bmap_add_extent( ASSERT(nblks <= da_old); if (nblks < da_old) xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); + (int64_t)(da_old - nblks), rsvd); } /* * Clear out the allocated field, done with it now in any case. @@ -605,13 +640,14 @@ xfs_bmap_add_extent( STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to use reserved data block allocation */ { xfs_btree_cur_t *cur; /* btree cursor */ int diff; /* temp value */ @@ -637,7 +673,7 @@ xfs_bmap_add_extent_delay_real( */ cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); @@ -656,9 +692,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -676,9 +712,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -709,14 +745,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. 
*/ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, idx, 2, state); + ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -748,14 +784,13 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, idx, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -779,13 +814,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx; + xfs_iext_remove(ip, idx + 1, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -801,7 +837,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, PREV.br_state))) goto done; } - *dnew = 0; break; @@ -811,10 +846,11 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -830,7 +866,6 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - *dnew = 0; break; @@ -839,16 +874,17 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); + ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -868,9 +904,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -879,11 +913,12 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -911,10 +946,9 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); + ep = xfs_iext_get_ext(ifp, idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); *dnew = temp; break; @@ -924,13 +958,15 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -947,14 +983,10 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_state))) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -964,9 +996,10 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. 
*/ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); + xfs_iext_insert(ip, idx + 1, 1, new, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -994,11 +1027,9 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; break; @@ -1025,7 +1056,7 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; @@ -1034,7 +1065,8 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); + xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1065,7 +1097,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { + -((int64_t)diff), rsvd)) { /* * Ick gross gag me with a spoon. 
*/ @@ -1077,7 +1109,7 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) + -((int64_t)diff), rsvd)) break; } if (temp2) { @@ -1086,20 +1118,18 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) + -((int64_t)diff), rsvd)) break; } } } - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); - - ++*idx; + trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); *dnew = temp + temp2; break; @@ -1131,7 +1161,7 @@ xfs_bmap_add_extent_delay_real( STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ @@ -1158,7 +1188,7 @@ xfs_bmap_add_extent_unwritten_real( error = 0; cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; oldext = (newext == XFS_EXT_UNWRITTEN) ? @@ -1181,9 +1211,9 @@ xfs_bmap_add_extent_unwritten_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1201,9 +1231,9 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1232,15 +1262,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. 
*/ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, idx, 2, state); + ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1276,14 +1305,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, idx, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1313,12 +1341,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; + xfs_iext_remove(ip, idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1349,10 +1378,11 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1374,22 +1404,21 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1420,16 +1449,17 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1458,19 +1488,17 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1500,14 +1528,13 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - ++*idx; - xfs_iext_insert(ip, *idx, 1, new, state); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_iext_insert(ip, idx + 1, 1, new, state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1541,10 +1568,10 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. 
* One extent becomes three extents. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); r[0] = *new; r[1].br_startoff = new_endoff; @@ -1552,10 +1579,8 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - - ++*idx; - xfs_iext_insert(ip, *idx, 2, &r[0], state); - + xfs_iext_insert(ip, idx + 1, 2, &r[0], state); + ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents += 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1625,10 +1650,12 @@ xfs_bmap_add_extent_unwritten_real( STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to allocate reserved blocks */ { + xfs_bmbt_rec_host_t *ep; /* extent record for idx */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1638,15 +1665,16 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t temp=0; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, idx); state = 0; ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1656,9 +1684,9 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(ep, &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -1691,21 +1719,21 @@ xfs_bmap_add_extent_hole_delay( * on the left and on the right. * Merge all three into a single extent record. 
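The conversion case above turns one extent record into three when only the middle of a previous oldext extent changes state. A compact sketch of the interval arithmetic, assuming a simplified record of just (start, count) instead of the full xfs_bmbt_irec_t:

#include <stdio.h>

struct sk_ext { unsigned long start, count; };

/* Carve [mid.start, mid.start + mid.count) out of prev; prev fully covers mid. */
static void split_middle(struct sk_ext prev, struct sk_ext mid,
                         struct sk_ext out[3])
{
    out[0].start = prev.start;                        /* untouched head */
    out[0].count = mid.start - prev.start;
    out[1] = mid;                                     /* converted middle */
    out[2].start = mid.start + mid.count;             /* untouched tail */
    out[2].count = prev.start + prev.count - out[2].start;
}

int main(void)
{
    struct sk_ext out[3];

    split_middle((struct sk_ext){ 0, 10 }, (struct sk_ext){ 3, 4 }, out);
    for (int i = 0; i < 3; i++)
        printf("[%lu, +%lu)\n", out[i].start, out[i].count);  /* [0,+3) [3,+4) [7,+3) */
    return 0;
}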
*/ - --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, idx, 1, state); + ip->i_df.if_lastex = idx - 1; break; case BMAP_LEFT_CONTIG: @@ -1714,17 +1742,17 @@ xfs_bmap_add_extent_hole_delay( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; temp = left.br_blockcount + new->br_blockcount; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + ip->i_df.if_lastex = idx - 1; break; case BMAP_RIGHT_CONTIG: @@ -1733,15 +1761,16 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, + xfs_bmbt_set_allf(ep, new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + + ip->i_df.if_lastex = idx; break; case 0: @@ -1751,13 +1780,14 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ip->i_df.if_lastex = idx; break; } if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + (int64_t)(oldlen - newlen), rsvd); /* * Nothing to do for disk quota accounting here. */ @@ -1773,12 +1803,13 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { + xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. 
point */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1788,7 +1819,8 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ep = xfs_iext_get_ext(ifp, idx); state = 0; if (whichfork == XFS_ATTR_FORK) @@ -1797,9 +1829,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1808,9 +1840,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(ep, &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1847,15 +1879,14 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_remove(ip, *idx + 1, 1, state); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + xfs_iext_remove(ip, idx, 1, state); + ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { @@ -1890,12 +1921,12 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ifp->if_lastex = idx - 1; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1921,13 +1952,13 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ifp->if_lastex = idx; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1953,7 +1984,8 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. 
*/ - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, idx, 1, new, state); + ifp->if_lastex = idx; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { @@ -2801,12 +2833,13 @@ STATIC int /* error */ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_extnum_t idx, /* extent number to update/delete */ xfs_bmap_free_t *flist, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to allocate reserved blocks */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -2837,10 +2870,10 @@ xfs_bmap_del_extent( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + ASSERT((idx >= 0) && (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; @@ -2914,12 +2947,11 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, + xfs_iext_remove(ip, idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; + ifp->if_lastex = idx; if (delay) break; - XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; @@ -2936,20 +2968,21 @@ xfs_bmap_del_extent( /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -2965,17 +2998,18 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -2992,7 +3026,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. 
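xfs_bmap_del_extent() distinguishes four cases by comparing the deletion range against the extent that contains it. A self-contained sketch of that classification, using plain offsets in place of the kernel's br_startoff/br_blockcount fields:

#include <stdio.h>

/* Classify a deletion range [ds, de) against the containing extent [gs, ge). */
static const char *del_case(unsigned long gs, unsigned long ge,
                            unsigned long ds, unsigned long de)
{
    if (ds == gs && de == ge)
        return "whole extent: remove the record";
    if (ds == gs)
        return "front: advance startoff, shrink blockcount";
    if (de == ge)
        return "back: shrink blockcount only";
    return "middle: shrink in place, insert a record for the tail";
}

int main(void)
{
    /* the extent covers [10, 20) */
    printf("%s\n", del_case(10, 20, 10, 20));
    printf("%s\n", del_case(10, 20, 10, 14));
    printf("%s\n", del_case(10, 20, 16, 20));
    printf("%s\n", del_case(10, 20, 13, 17));
    return 0;
}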
*/ temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3079,9 +3113,9 @@ xfs_bmap_del_extent( } } } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_iext_insert(ip, idx + 1, 1, &new, state); + ifp->if_lastex = idx + 1; break; } /* @@ -3108,7 +3142,7 @@ xfs_bmap_del_extent( ASSERT(da_old >= da_new); if (da_old > da_new) { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); + (int64_t)(da_old - da_new), rsvd); } done: *logflagsp = flags; @@ -4528,24 +4562,29 @@ xfs_bmapi( if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + -((int64_t)extsz), (flags & + XFS_BMAPI_RSVBLOCKS)); } else { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + -((int64_t)alen), (flags & + XFS_BMAPI_RSVBLOCKS)); } if (!error) { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + -((int64_t)indlen), (flags & + XFS_BMAPI_RSVBLOCKS)); if (error && rt) xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); + (int64_t)extsz, (flags & + XFS_BMAPI_RSVBLOCKS)); else if (error) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); + (int64_t)alen, (flags & + XFS_BMAPI_RSVBLOCKS)); } if (error) { @@ -4662,12 +4701,13 @@ xfs_bmapi( if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) got.br_state = XFS_EXT_UNWRITTEN; } - error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, + error = xfs_bmap_add_extent(ip, lastx, &cur, &got, firstblock, flist, &tmp_logflags, - whichfork); + whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) goto error0; + lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4763,12 +4803,13 @@ xfs_bmapi( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, + error = xfs_bmap_add_extent(ip, lastx, &cur, mval, firstblock, flist, &tmp_logflags, - whichfork); + whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) goto error0; + lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4827,14 +4868,14 @@ xfs_bmapi( /* * Else go on to the next record. */ + ep = xfs_iext_get_ext(ifp, ++lastx); prev = got; - if (++lastx < nextents) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + if (lastx >= nextents) eof = 1; - } + else + xfs_bmbt_get_all(ep, &got); } + ifp->if_lastex = lastx; *nmap = n; /* * Transform from btree to extents, give it cur. 
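The xfs_bmapi() hunk above reserves space in two steps (the extent or data-block count, then the worst-case indirect blocks) and rolls the first step back if the second fails. A toy model of that reserve-then-unwind pattern; the two counters here are simplified stand-ins, and in XFS the non-realtime case draws both reservations from the same free-block counter:

#include <stdio.h>

static long free_blocks = 100;   /* stand-in for the free-block counter */
static long free_ind    = 100;   /* split out here only for illustration */

static int reserve(long alen, long indlen)
{
    if (free_blocks < alen)
        return -1;                       /* ENOSPC */
    free_blocks -= alen;                 /* step 1: the data blocks */

    if (free_ind < indlen) {             /* step 2 failed... */
        free_blocks += alen;             /* ...undo step 1 before erroring out */
        return -1;
    }
    free_ind -= indlen;                  /* step 2: worst-case indirect blocks */
    return 0;
}

int main(void)
{
    printf("reserve(40, 5)   -> %d (free=%ld)\n", reserve(40, 5), free_blocks);
    printf("reserve(50, 500) -> %d (free=%ld)\n", reserve(50, 500), free_blocks);
    return 0;
}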
@@ -4943,6 +4984,7 @@ xfs_bmapi_single( ASSERT(!isnullstartblock(got.br_startblock)); ASSERT(bno < got.br_startoff + got.br_blockcount); *fsb = got.br_startblock + (bno - got.br_startoff); + ifp->if_lastex = lastx; return 0; } @@ -4984,6 +5026,7 @@ xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ + int rsvd; /* OK to allocate reserved blocks */ xfs_fsblock_t sum; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5001,7 +5044,7 @@ xfs_bunmapi( mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; ASSERT(len > 0); ASSERT(nexts >= 0); ASSERT(ifp->if_ext_max == @@ -5117,9 +5160,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5145,12 +5188,9 @@ xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { @@ -5174,19 +5214,18 @@ xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - lastx--; - error = xfs_bmap_add_extent(ip, &lastx, &cur, + error = xfs_bmap_add_extent(ip, lastx - 1, &cur, &prev, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5201,13 +5240,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + (int64_t)rtexts, rsvd); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + (int64_t)del.br_blockcount, rsvd); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5238,29 +5277,31 @@ xfs_bunmapi( error = XFS_ERROR(ENOSPC); goto error0; } - error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, - &tmp_logflags, whichfork); + error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, + &tmp_logflags, whichfork, rsvd); logflags |= tmp_logflags; if (error) goto error0; bno = del.br_startoff - 1; nodelete: + lastx = ifp->if_lastex; /* * If not done go on to the next (previous) record. + * Reset ep in case the extents array was re-alloced. 
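The comment above ("Reset ep in case the extents array was re-alloced") is the crux of the if_lastex scheme this patch restores: helpers record the last index they touched, and callers re-derive pointers from that index after any operation that may have reallocated the array. A small sketch of the idiom, with error handling elided:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct extlist {
    int *items;
    int  nr;
    int  lastex;     /* index of the last record touched, mirrors if_lastex */
};

static int *get_ext(struct extlist *l, int idx)
{
    l->lastex = idx;
    return &l->items[idx];
}

/* Insertion may realloc the array, invalidating every cached pointer.
 * (realloc failure is ignored in this sketch) */
static void insert_ext(struct extlist *l, int idx, int val)
{
    l->items = realloc(l->items, (l->nr + 1) * sizeof(*l->items));
    memmove(&l->items[idx + 1], &l->items[idx],
            (l->nr - idx) * sizeof(*l->items));
    l->items[idx] = val;
    l->nr++;
    l->lastex = idx;
}

int main(void)
{
    struct extlist l = { malloc(2 * sizeof(int)), 2, 0 };
    l.items[0] = 10;
    l.items[1] = 30;

    int *ep = get_ext(&l, 1);       /* cached pointer... */
    insert_ext(&l, 1, 20);          /* ...may now dangle after the realloc */
    ep = get_ext(&l, l.lastex);     /* so re-derive it from lastex */
    printf("%d\n", *ep);            /* prints 20 */
    free(l.items);
    return 0;
}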
*/ + ep = xfs_iext_get_ext(ifp, lastx); if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || + xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, lastx); } + if (lastx >= 0) + xfs_bmbt_get_all(ep, &got); extno++; } } + ifp->if_lastex = lastx; *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); diff --git a/trunk/fs/xfs/xfs_bmap.h b/trunk/fs/xfs/xfs_bmap.h index c62234bde053..3651191daea1 100644 --- a/trunk/fs/xfs/xfs_bmap.h +++ b/trunk/fs/xfs/xfs_bmap.h @@ -69,6 +69,7 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ @@ -86,6 +87,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ diff --git a/trunk/fs/xfs/xfs_inode.c b/trunk/fs/xfs/xfs_inode.c index a098a20ca63e..c8e3349c287c 100644 --- a/trunk/fs/xfs/xfs_inode.c +++ b/trunk/fs/xfs/xfs_inode.c @@ -920,6 +920,7 @@ xfs_iread_extents( /* * We know that the size is valid (it's checked in iformat_btree) */ + ifp->if_lastex = NULLEXTNUM; ifp->if_bytes = ifp->if_real_bytes = 0; ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); @@ -2557,9 +2558,12 @@ xfs_iflush_fork( case XFS_DINODE_FMT_EXTENTS: ASSERT((ifp->if_flags & XFS_IFEXTENTS) || !(iip->ili_format.ilf_fields & extflag[whichfork])); + ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || + (ifp->if_bytes == 0)); + ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || + (ifp->if_bytes > 0)); if ((iip->ili_format.ilf_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); @@ -3108,8 +3112,6 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; } else if (ifp->if_flags & XFS_IFEXTIREC) { @@ -3189,6 +3191,7 @@ xfs_iext_add( } ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; ifp->if_real_bytes = 0; + ifp->if_lastex = nextents + ext_diff; } /* * Otherwise use a linear (direct) extent list. 
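xfs_bmap.h pairs each XFS_BMAPI_* bit with a string in a { flag, "NAME" } table so trace output can print flag words symbolically. A minimal sketch of the same table-driven decoding; the BM_* names are illustrative, though the three values shown match the defines above:

#include <stdio.h>

#define BM_ENTIRE    0x004
#define BM_METADATA  0x008
#define BM_RSVBLOCKS 0x020

static const struct { unsigned bit; const char *name; } bm_names[] = {
    { BM_ENTIRE,    "ENTIRE" },
    { BM_METADATA,  "METADATA" },
    { BM_RSVBLOCKS, "RSVBLOCKS" },
};

static void bm_print(unsigned flags)
{
    for (unsigned i = 0; i < sizeof(bm_names) / sizeof(bm_names[0]); i++)
        if (flags & bm_names[i].bit)
            printf("%s ", bm_names[i].name);
    putchar('\n');
}

int main(void)
{
    bm_print(BM_METADATA | BM_RSVBLOCKS);   /* prints: METADATA RSVBLOCKS */
    return 0;
}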
@@ -3883,10 +3886,8 @@ xfs_iext_idx_to_irec( xfs_extnum_t page_idx = *idxp; /* extent index in target list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); - + ASSERT(page_idx >= 0 && page_idx <= + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; low = 0; diff --git a/trunk/fs/xfs/xfs_inode.h b/trunk/fs/xfs/xfs_inode.h index 3ae6d58e5473..ff4e2a30227d 100644 --- a/trunk/fs/xfs/xfs_inode.h +++ b/trunk/fs/xfs/xfs_inode.h @@ -67,6 +67,7 @@ typedef struct xfs_ifork { short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ unsigned char if_ext_max; /* max # of extent records */ + xfs_extnum_t if_lastex; /* last if_extents used */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ diff --git a/trunk/fs/xfs/xfs_log_cil.c b/trunk/fs/xfs/xfs_log_cil.c index c7755d5a5fbe..7d56e88a3f0e 100644 --- a/trunk/fs/xfs/xfs_log_cil.c +++ b/trunk/fs/xfs/xfs_log_cil.c @@ -29,7 +29,6 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" -#include "xfs_discard.h" /* * Perform initial CIL structure initialisation. If the CIL is not @@ -362,28 +361,18 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, - (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); spin_lock(&ctx->cil->xc_cil_lock); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_cil_lock); xlog_cil_free_logvec(ctx->lv_chain); - - if (!list_empty(&ctx->busy_extents)) { - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); - - xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); - } - kmem_free(ctx); } diff --git a/trunk/fs/xfs/xfs_mount.h b/trunk/fs/xfs/xfs_mount.h index 3d68bb267c5f..19af0ab0d0c6 100644 --- a/trunk/fs/xfs/xfs_mount.h +++ b/trunk/fs/xfs/xfs_mount.h @@ -224,7 +224,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ -#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ #define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment diff --git a/trunk/fs/xfs/xfs_trans.c b/trunk/fs/xfs/xfs_trans.c index 7c7bc2b786bd..d1f24858ccc4 100644 --- a/trunk/fs/xfs/xfs_trans.c +++ b/trunk/fs/xfs/xfs_trans.c @@ -609,7 +609,7 @@ xfs_trans_free( struct xfs_trans *tp) { xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/trunk/include/linux/buffer_head.h b/trunk/include/linux/buffer_head.h index 503c8a6b3079..f5df23561b96 100644 --- a/trunk/include/linux/buffer_head.h +++ b/trunk/include/linux/buffer_head.h @@ -217,24 +217,8 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page 
*page, unsigned from, unsigned to); -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -/* Convert errno to return value from ->page_mkwrite() call */ -static inline int block_page_mkwrite_return(int err) -{ - if (err == 0) - return VM_FAULT_LOCKED; - if (err == -EFAULT) - return VM_FAULT_NOPAGE; - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err == -EAGAIN) - return VM_FAULT_RETRY; - /* -ENOSPC, -EDQUOT, -EIO ... */ - return VM_FAULT_SIGBUS; -} sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, diff --git a/trunk/include/linux/cleancache.h b/trunk/include/linux/cleancache.h deleted file mode 100644 index 04ffb2e6c9d0..000000000000 --- a/trunk/include/linux/cleancache.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(char *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*flush_page)(int, struct cleancache_filekey, pgoff_t); - void (*flush_inode)(int, struct cleancache_filekey); - void (*flush_fs)(int); -}; - -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_flush_page(struct address_space *, struct page *); -extern void __cleancache_flush_inode(struct address_space *); -extern void __cleancache_flush_fs(struct super_block *); -extern int cleancache_enabled; - -#ifdef CONFIG_CLEANCACHE -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. 
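The deleted cleancache.h centers on two pieces: an ops structure filled in by a backend at registration time (the old ops are returned so nesting can be detected) and a cheap global enable flag tested by every inline shim. A self-contained toy version of that pattern, with cc_* names standing in for the cleancache_* ones:

#include <stdio.h>

struct cc_ops {
    int  (*get)(int key, int *val);
    void (*put)(int key, int val);
};

static int cc_enabled;              /* cheap global gate, read on every call */
static struct cc_ops cc_ops;        /* active backend, set at registration */

/* Returns the previously registered ops so a caller can detect nesting. */
static struct cc_ops cc_register(struct cc_ops *ops)
{
    struct cc_ops old = cc_ops;
    cc_ops = *ops;
    cc_enabled = 1;
    return old;
}

static int cc_get(int key, int *val)
{
    if (!cc_enabled)                /* no backend: one flag test and done */
        return -1;
    return cc_ops.get(key, val);
}

/* toy backend */
static int store[16];
static int  toy_get(int key, int *val) { *val = store[key & 15]; return 0; }
static void toy_put(int key, int val)  { store[key & 15] = val; }

int main(void)
{
    struct cc_ops toy = { toy_get, toy_put };
    int v;

    if (cc_get(3, &v) < 0)
        puts("disabled: the call collapses to a single flag test");

    cc_register(&toy);
    cc_ops.put(3, 42);
    if (cc_get(3, &v) == 0)
        printf("hit: %d\n", v);
    return 0;
}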
- * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - int ret = -1; - - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_flush_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_page(mapping, page); -} - -static inline void cleancache_flush_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_inode(mapping); -} - -static inline void cleancache_flush_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_flush_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 241609346dfb..3f9d3251790d 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -1428,11 +1428,6 @@ struct super_block { */ char __rcu *s_options; const struct dentry_operations *s_d_op; /* default d_op for dentries */ - - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 59225ef27d15..943c76b3d4bb 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -1,7 +1,6 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H -#include #include #include @@ -42,7 +41,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - vm_flags_t vm_flags); + int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); void copy_huge_page(struct page *dst, struct page *src); @@ -169,7 +168,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, size_t size, int acct, struct user_struct **user, int creat_flags); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -193,7 +192,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() static inline struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, int creat_flags) + int acctflag, struct user_struct **user, int creat_flags) { return ERR_PTR(-ENOSYS); } diff --git 
a/trunk/include/linux/hugetlb_inline.h b/trunk/include/linux/hugetlb_inline.h index 2bb681fbeb35..6931489a5c14 100644 --- a/trunk/include/linux/hugetlb_inline.h +++ b/trunk/include/linux/hugetlb_inline.h @@ -7,7 +7,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_HUGETLB); + return vma->vm_flags & VM_HUGETLB; } #else diff --git a/trunk/include/linux/if_link.h b/trunk/include/linux/if_link.h index 0ee969a5593d..f4a2e6b1b864 100644 --- a/trunk/include/linux/if_link.h +++ b/trunk/include/linux/if_link.h @@ -136,7 +136,6 @@ enum { IFLA_PORT_SELF, IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ - IFLA_NET_NS_FD, __IFLA_MAX }; diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 4ecb7b16b278..a32dcaec04e1 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -529,10 +529,9 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, + T_RUNDOWN, T_FLUSH, T_COMMIT, - T_COMMIT_DFLUSH, - T_COMMIT_JFLUSH, T_FINISHED } t_state; @@ -659,9 +658,7 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; - - /* Disk flush needs to be sent to fs partition [no locking] */ - int t_need_data_flush; + unsigned int t_flushed_data_blocks:1; /* * For use by the filesystem to store fs-specific data @@ -1231,7 +1228,6 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); -int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index fb8e814f78dc..8eb969ebf904 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -165,12 +165,12 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_PFN_AT_MMAP); + return (vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_PFNMAP); + return (vma->vm_flags & VM_PFNMAP); } /* @@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff); + unsigned int vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 6fe96c19f85e..071d459e866b 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -102,8 +102,6 @@ struct page { #endif }; -typedef unsigned long __nocast vm_flags_t; - /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. 
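Several hunks above revert `return !!(flags & BIT)` back to `return flags & BIT`. The `!!` form matters when a wide flag word is masked and then narrowed to int: a bit above position 31 would be truncated away, while `!!` collapses the result to 0 or 1 first. A two-line demonstration (the specific VM_* bits touched here do fit in an int, so this illustrates the general hazard rather than a bug in this revert):

#include <stdio.h>

int main(void)
{
    unsigned long flags = 1UL << 40;            /* a flag above bit 31 */

    int raw  = flags & (1UL << 40);             /* high bits lost converting to int */
    int norm = !!(flags & (1UL << 40));         /* collapsed to 0 or 1 first */

    printf("%d %d\n", raw, norm);               /* "0 1" on a typical LP64 build */
    return 0;
}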
These are held in a global tree and are pinned by the VMAs that @@ -111,7 +109,7 @@ typedef unsigned long __nocast vm_flags_t; */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ - vm_flags_t vm_flags; /* VMA vm_flags */ + unsigned long vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index 648c9c58add7..3686cd6c9aca 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -179,8 +179,6 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); -extern struct file *proc_ns_fget(int fd); - #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -243,11 +241,6 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) {} -static inline struct file *proc_ns_fget(int fd) -{ - return ERR_PTR(-EINVAL); -} - #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -259,18 +252,6 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type) extern void kclist_add(struct kcore_list *, void *, size_t, int type); #endif -struct nsproxy; -struct proc_ns_operations { - const char *name; - int type; - void *(*get)(struct task_struct *task); - void (*put)(void *ns); - int (*install)(struct nsproxy *nsproxy, void *ns); -}; -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; - union proc_op { int (*proc_get_link)(struct inode *, struct path *); int (*proc_read)(struct task_struct *task, char *page); @@ -289,8 +270,6 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - void *ns; - const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; diff --git a/trunk/include/linux/syscalls.h b/trunk/include/linux/syscalls.h index 8c03b98df5f9..ab71447d0c5a 100644 --- a/trunk/include/linux/syscalls.h +++ b/trunk/include/linux/syscalls.h @@ -846,5 +846,4 @@ asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); -asmlinkage long sys_setns(int fd, int nstype); #endif diff --git a/trunk/include/net/net_namespace.h b/trunk/include/net/net_namespace.h index dcc8f5749d3f..3ae491932bc8 100644 --- a/trunk/include/net/net_namespace.h +++ b/trunk/include/net/net_namespace.h @@ -119,7 +119,6 @@ static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns) extern struct list_head net_namespace_list; extern struct net *get_net_ns_by_pid(pid_t pid); -extern struct net *get_net_ns_by_fd(int pid); #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); diff --git a/trunk/include/xen/interface/xen.h b/trunk/include/xen/interface/xen.h index 70213b4515eb..b33257bc7e83 100644 --- a/trunk/include/xen/interface/xen.h +++ b/trunk/include/xen/interface/xen.h @@ -58,7 +58,6 @@ #define __HYPERVISOR_event_channel_op 32 #define __HYPERVISOR_physdev_op 33 #define __HYPERVISOR_hvm_op 34 -#define __HYPERVISOR_tmem_op 38 /* Architecture-specific hypercall definitions. 
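The proc_ns_operations table removed above gives each namespace type a name plus get/put/install hooks, which is what sys_setns() drives. A toy version of the vtable and its call sequence, with reference counting reduced to an integer; the install-failure path returning the reference is an assumption of this sketch, not a claim about the kernel code:

#include <stdio.h>

struct ns_ops {
    const char *name;
    void *(*get)(void);           /* grab a referenced namespace object */
    void  (*put)(void *ns);       /* drop that reference */
    int   (*install)(void *ns);   /* switch the caller into it */
};

static int do_setns(const struct ns_ops *ops)
{
    void *ns = ops->get();
    if (!ns)
        return -1;
    int err = ops->install(ns);
    if (err)
        ops->put(ns);             /* install failed: give the reference back */
    return err;
}

/* toy "uts" namespace */
static int uts_refs;
static void *uts_get(void)        { uts_refs++; return &uts_refs; }
static void  uts_put(void *ns)    { (void)ns; uts_refs--; }
static int   uts_install(void *ns){ (void)ns; return 0; }

static const struct ns_ops uts_ops = { "uts", uts_get, uts_put, uts_install };

int main(void)
{
    printf("setns(uts) -> %d, refs=%d\n", do_setns(&uts_ops), uts_refs);
    return 0;
}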
*/ #define __HYPERVISOR_arch_0 48 @@ -462,27 +461,6 @@ typedef uint8_t xen_domain_handle_t[16]; #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) -#define TMEM_SPEC_VERSION 1 - -struct tmem_op { - uint32_t cmd; - int32_t pool_id; - union { - struct { /* for cmd == TMEM_NEW_POOL */ - uint64_t uuid[2]; - uint32_t flags; - } new; - struct { - uint64_t oid[3]; - uint32_t index; - uint32_t tmem_offset; - uint32_t pfn_offset; - uint32_t len; - GUEST_HANDLE(void) gmfn; /* guest machine page frame */ - } gen; - } u; -}; - #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ diff --git a/trunk/ipc/namespace.c b/trunk/ipc/namespace.c index ce0a647869b1..8054c8e5faf1 100644 --- a/trunk/ipc/namespace.c +++ b/trunk/ipc/namespace.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "util.h" @@ -141,39 +140,3 @@ void put_ipc_ns(struct ipc_namespace *ns) free_ipc_ns(ns); } } - -static void *ipcns_get(struct task_struct *task) -{ - struct ipc_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) - ns = get_ipc_ns(nsproxy->ipc_ns); - rcu_read_unlock(); - - return ns; -} - -static void ipcns_put(void *ns) -{ - return put_ipc_ns(ns); -} - -static int ipcns_install(struct nsproxy *nsproxy, void *ns) -{ - /* Ditch state from the old ipc namespace */ - exit_sem(current); - put_ipc_ns(nsproxy->ipc_ns); - nsproxy->ipc_ns = get_ipc_ns(ns); - return 0; -} - -const struct proc_ns_operations ipcns_operations = { - .name = "ipc", - .type = CLONE_NEWIPC, - .get = ipcns_get, - .put = ipcns_put, - .install = ipcns_install, -}; diff --git a/trunk/ipc/shm.c b/trunk/ipc/shm.c index ab3385a21b27..729acb7e3148 100644 --- a/trunk/ipc/shm.c +++ b/trunk/ipc/shm.c @@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; - vm_flags_t acctflag = 0; + int acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; diff --git a/trunk/kernel/nsproxy.c b/trunk/kernel/nsproxy.c index 5424e37673ed..a05d191ffdd9 100644 --- a/trunk/kernel/nsproxy.c +++ b/trunk/kernel/nsproxy.c @@ -22,9 +22,6 @@ #include #include #include -#include -#include -#include static struct kmem_cache *nsproxy_cachep; @@ -236,45 +233,6 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) -{ - const struct proc_ns_operations *ops; - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct proc_inode *ei; - struct file *file; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); - - err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) - goto out; - - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); - goto out; - } - - err = ops->install(new_nsproxy, ei->ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; - } - switch_task_namespaces(tsk, new_nsproxy); -out: - fput(file); - return err; -} - static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/trunk/kernel/utsname.c b/trunk/kernel/utsname.c index bff131b9510a..44646179eaba 100644 --- a/trunk/kernel/utsname.c +++ b/trunk/kernel/utsname.c @@ -15,7 +15,6 @@ #include #include #include -#include static struct uts_namespace 
*create_uts_ns(void) { @@ -80,41 +79,3 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } - -static void *utsns_get(struct task_struct *task) -{ - struct uts_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) { - ns = nsproxy->uts_ns; - get_uts_ns(ns); - } - rcu_read_unlock(); - - return ns; -} - -static void utsns_put(void *ns) -{ - put_uts_ns(ns); -} - -static int utsns_install(struct nsproxy *nsproxy, void *ns) -{ - get_uts_ns(ns); - put_uts_ns(nsproxy->uts_ns); - nsproxy->uts_ns = ns; - return 0; -} - -const struct proc_ns_operations utsns_operations = { - .name = "uts", - .type = CLONE_NEWUTS, - .get = utsns_get, - .put = utsns_put, - .install = utsns_install, -}; - diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 8ca47a5ee9c8..e9c0c61f2ddd 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -347,26 +347,3 @@ config NEED_PER_CPU_KM depends on !SMP bool default y - -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - default n - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancacne code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 836e4163c1bf..42a8326c3e3d 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -49,4 +49,3 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/trunk/mm/cleancache.c b/trunk/mm/cleancache.c deleted file mode 100644 index bcaae4c2a770..000000000000 --- a/trunk/mm/cleancache.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ - -#include -#include -#include -#include -#include - -/* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/flush even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. 
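The removed Kconfig text describes cleancache as a page-granularity victim cache: clean pages are offered to the cache on eviction, and reads consult it before touching the disk. A toy model of that put-on-evict / get-on-read flow, with a fixed array standing in for transcendent memory:

#include <stdio.h>
#include <string.h>

#define SLOTS 8
static char cache[SLOTS][64];     /* toy "transcendent memory" */
static int  present[SLOTS];

static void cache_put(int key, const char *page)   /* on clean-page eviction */
{
    strcpy(cache[key % SLOTS], page);              /* short fixed strings only */
    present[key % SLOTS] = 1;
}

static int cache_get(int key, char *page)          /* consulted before disk I/O */
{
    if (!present[key % SLOTS])
        return -1;
    strcpy(page, cache[key % SLOTS]);
    return 0;
}

static int read_page(int key, char *page)
{
    if (cache_get(key, page) == 0)
        return 0;                                  /* hit: no disk access */
    strcpy(page, "(read from disk)");              /* miss: fall back to I/O */
    return 1;
}

int main(void)
{
    char buf[64];
    int miss;

    cache_put(5, "clean page contents");           /* PFRA evicts a clean page */
    miss = read_page(5, buf);
    printf("read(5): %s (%s)\n", buf, miss ? "miss" : "hit");
    return 0;
}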
- */ -int cleancache_enabled; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static struct cleancache_ops cleancache_ops; - -/* useful stats available in /sys/kernel/mm/cleancache */ -static unsigned long cleancache_succ_gets; -static unsigned long cleancache_failed_gets; -static unsigned long cleancache_puts; -static unsigned long cleancache_flushes; - -/* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting - */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) -{ - struct cleancache_ops old = cleancache_ops; - - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. 
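cleancache_get_key() above prefers an exportable filehandle as the per-file key and falls back to the inode number when the filesystem has no encode_fh hook. A simplified sketch of that key-derivation logic; the types and the toy encoder are illustrative, not the exportfs API:

#include <stdio.h>
#include <string.h>

#define FH_MAX 6

struct sk_key {
    unsigned long ino;
    unsigned      fh[FH_MAX];
};

/* encode_fh-style hook: returns the filehandle length, or <= 0 on failure */
typedef int (*encode_fh_t)(unsigned long ino, unsigned *fh, int maxlen);

static int make_key(unsigned long ino, encode_fh_t encode_fh,
                    struct sk_key *key)
{
    memset(key, 0, sizeof(*key));
    key->ino = ino;                      /* default: the inode number */
    if (encode_fh) {                     /* exportable fs: prefer the filehandle */
        int len = encode_fh(ino, key->fh, FH_MAX);
        if (len <= 0)
            return -1;                   /* no stable key for this file */
    }
    return 0;
}

static int toy_encode(unsigned long ino, unsigned *fh, int maxlen)
{
    if (maxlen < 2)
        return 0;
    fh[0] = (unsigned)ino;
    fh[1] = 0x1234;                      /* e.g. a generation number */
    return 2;
}

int main(void)
{
    struct sk_key k;

    make_key(42, NULL, &k);              /* simple fs: ino is the key */
    printf("ino key: %lu\n", k.ino);

    make_key(42, toy_encode, &k);        /* exportable fs: fh is the key */
    printf("fh key: %u %u\n", k.fh[0], k.fh[1]);
    return 0;
}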
- */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Flush any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - */ -void __cleancache_flush_page(struct address_space *mapping, struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0) { - VM_BUG_ON(!PageLocked(page)); - if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.flush_page)(pool_id, key, page->index); - cleancache_flushes++; - } - } -} -EXPORT_SYMBOL(__cleancache_flush_page); - -/* - * Flush all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - */ -void __cleancache_flush_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.flush_inode)(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_flush_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs - */ -void __cleancache_flush_fs(struct super_block *sb) -{ - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.flush_fs)(old_poolid); - } -} -EXPORT_SYMBOL(__cleancache_flush_fs); - -#ifdef CONFIG_SYSFS - -/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ - -#define CLEANCACHE_SYSFS_RO(_name) \ - static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", cleancache_##_name); \ - } \ - static struct kobj_attribute cleancache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = cleancache_##_name##_show, \ - } - -CLEANCACHE_SYSFS_RO(succ_gets); -CLEANCACHE_SYSFS_RO(failed_gets); -CLEANCACHE_SYSFS_RO(puts); -CLEANCACHE_SYSFS_RO(flushes); - -static struct attribute *cleancache_attrs[] = { - &cleancache_succ_gets_attr.attr, - &cleancache_failed_gets_attr.attr, - &cleancache_puts_attr.attr, - &cleancache_flushes_attr.attr, - NULL, -}; - -static struct attribute_group cleancache_attr_group = { - .attrs = cleancache_attrs, - .name = "cleancache", -}; - -#endif /* CONFIG_SYSFS */ - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_SYSFS - int err; - - err = sysfs_create_group(mm_kobj, &cleancache_attr_group); -#endif /* CONFIG_SYSFS */ - return 0; -} -module_init(init_cleancache) diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c index 7455ccd8bda8..68e782b3d3de 100644 --- a/trunk/mm/filemap.c +++ b/trunk/mm/filemap.c @@ -34,7 +34,6 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ -#include #include "internal.h" /* @@ -119,16 +118,6 @@ void __delete_from_page_cache(struct page *page) { 
struct address_space *mapping = page->mapping; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); - else - cleancache_flush_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/trunk/mm/fremap.c b/trunk/mm/fremap.c index b8e0e2d468af..7f4123056e06 100644 --- a/trunk/mm/fremap.c +++ b/trunk/mm/fremap.c @@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - vm_flags_t saved_flags = vma->vm_flags; + unsigned int saved_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); vma->vm_flags = saved_flags; } diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index f33bb319b73f..5fd68b95c671 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - vm_flags_t vm_flags) + int acctflag) { long ret, chg; struct hstate *h = hstate_inode(inode); @@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * and filesystem quota without using reserves */ - if (vm_flags & VM_NORESERVE) + if (acctflag & VM_NORESERVE) return 0; /* diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index fc24f7d788bd..b73f677f0bb1 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline int is_cow_mapping(unsigned int flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } diff --git a/trunk/mm/mlock.c b/trunk/mm/mlock.c index 048260c4e02e..516b2c2ddd5a 100644 --- a/trunk/mm/mlock.c +++ b/trunk/mm/mlock.c @@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, unsigned int newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); + int lock = newflags & VM_LOCKED; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; + unsigned int newflags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
 		goto out;
 
 	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
-		vm_flags_t newflags;
+		unsigned int newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
 		if (!(flags & MCL_CURRENT))
diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c
index bbdc9af5e117..ac2631b7477f 100644
--- a/trunk/mm/mmap.c
+++ b/trunk/mm/mmap.c
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
-	vm_flags_t vm_flags;
+	unsigned int vm_flags;
 	int error;
 	unsigned long reqprot = prot;
 
@@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-	vm_flags_t vm_flags = vma->vm_flags;
+	unsigned int vm_flags = vma->vm_flags;
 
 	/* If it was private or non-writable, the write bit is already clear */
 	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 {
 	/*
 	 * hugetlb has its own accounting separate from the core VM
@@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+			  unsigned int vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
diff --git a/trunk/mm/slub.c b/trunk/mm/slub.c
index 7be0223531b0..4aad32d2e60d 100644
--- a/trunk/mm/slub.c
+++ b/trunk/mm/slub.c
@@ -1831,6 +1831,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	page->inuse = page->objects;
 	page->freelist = NULL;
 
+unlock_out:
 	slab_unlock(page);
 	c->tid = next_tid(c->tid);
 	local_irq_restore(flags);
diff --git a/trunk/mm/truncate.c b/trunk/mm/truncate.c
index 3a29a6180212..a95667529135 100644
--- a/trunk/mm/truncate.c
+++ b/trunk/mm/truncate.c
@@ -19,7 +19,6 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
-#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -52,7 +51,6 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_flush_page(page->mapping, page);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }
@@ -216,7 +214,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t next;
 	int i;
 
-	cleancache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -294,7 +291,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 	}
-	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -444,7 +440,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
-	cleancache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !wrapped &&
@@ -503,7 +498,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
-	cleancache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/trunk/net/core/net_namespace.c b/trunk/net/core/net_namespace.c
index 6c6b86d0da15..2e2dce6583e1 100644
--- a/trunk/net/core/net_namespace.c
+++ b/trunk/net/core/net_namespace.c
@@ -8,8 +8,6 @@
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
-#include <linux/proc_fs.h>
-#include <linux/file.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -304,28 +302,6 @@ void __put_net(struct net *net)
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
-struct net *get_net_ns_by_fd(int fd)
-{
-	struct proc_inode *ei;
-	struct file *file;
-	struct net *net;
-
-	net = ERR_PTR(-EINVAL);
-	file = proc_ns_fget(fd);
-	if (!file)
-		goto out;
-
-	ei = PROC_I(file->f_dentry->d_inode);
-	if (ei->ns_ops != &netns_operations)
-		goto out;
-
-	net = get_net(ei->ns);
-out:
-	if (file)
-		fput(file);
-	return net;
-}
-
 #else
 struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 {
@@ -333,11 +309,6 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 		return ERR_PTR(-EINVAL);
 	return old_net;
 }
-
-struct net *get_net_ns_by_fd(int fd)
-{
-	return ERR_PTR(-EINVAL);
-}
 #endif
 
 struct net *get_net_ns_by_pid(pid_t pid)
@@ -590,39 +561,3 @@ void unregister_pernet_device(struct pernet_operations *ops)
 	mutex_unlock(&net_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
-
-#ifdef CONFIG_NET_NS
-static void *netns_get(struct task_struct *task)
-{
-	struct net *net = NULL;
-	struct nsproxy *nsproxy;
-
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
-	if (nsproxy)
-		net = get_net(nsproxy->net_ns);
-	rcu_read_unlock();
-
-	return net;
-}
-
-static void netns_put(void *ns)
-{
-	put_net(ns);
-}
-
-static int netns_install(struct nsproxy *nsproxy, void *ns)
-{
-	put_net(nsproxy->net_ns);
-	nsproxy->net_ns = get_net(ns);
-	return 0;
-}
-
-const struct proc_ns_operations netns_operations = {
-	.name		= "net",
-	.type		= CLONE_NEWNET,
-	.get		= netns_get,
-	.put		= netns_put,
-	.install	= netns_install,
-};
-#endif
diff --git a/trunk/net/core/rtnetlink.c b/trunk/net/core/rtnetlink.c
index abd936d8a716..2d56cb9b0b94 100644
--- a/trunk/net/core/rtnetlink.c
+++ b/trunk/net/core/rtnetlink.c
@@ -1046,7 +1046,6 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKMODE]		= { .type = NLA_U8 },
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
-	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]		= { .type = NLA_STRING, .len = IFALIASZ-1 },
 	[IFLA_VFINFO_LIST]	= { .type = NLA_NESTED },
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
@@ -1095,8 +1094,6 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 	 */
 	if (tb[IFLA_NET_NS_PID])
 		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
-	else if (tb[IFLA_NET_NS_FD])
-		net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
 	else
 		net = get_net(src_net);
 	return net;
@@ -1227,7 +1224,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	int send_addr_notify = 0;
 	int err;
 
-	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
+	if (tb[IFLA_NET_NS_PID]) {
 		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
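
As context for the cleancache hunks above: the hooks being unwired implement a put/get/flush protocol between the page cache and an opaque backend. A clean page may be handed to the backend on eviction ("put"), a later read may consult the backend before going to disk ("get", which is always allowed to fail), and truncation or overwrite must flush any stale copy. The sketch below is a self-contained user-space toy model of that lifecycle, not kernel code; every toy_* name is invented for illustration and only loosely mirrors the cleancache_ops callbacks deleted above.

/*
 * Toy model of the cleancache pool lifecycle, compilable in user space.
 * Not kernel code; keys here are a bare page index, whereas the real
 * interface keys by (pool id, inode key, page index).
 */
#include <stdio.h>
#include <string.h>

#define POOLS 4
#define SLOTS 8

struct toy_page { long index; char data[16]; int used; };
static struct toy_page pools[POOLS][SLOTS];
static int next_pool;

static int toy_init_fs(void)			/* mount: claim a pool id */
{
	return next_pool < POOLS ? next_pool++ : -1;
}

static void toy_put_page(int pool, long idx, const char *data)
{						/* eviction: cache a clean page */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	p->index = idx;
	snprintf(p->data, sizeof(p->data), "%s", data);
	p->used = 1;
}

static int toy_get_page(int pool, long idx, char *out, size_t len)
{						/* refault: try backend before disk */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	if (!p->used || p->index != idx)
		return -1;			/* a "get" may always miss */
	snprintf(out, len, "%s", p->data);
	return 0;
}

static void toy_flush_page(int pool, long idx)
{						/* truncate/overwrite: drop stale copy */
	struct toy_page *p = &pools[pool][idx % SLOTS];
	if (p->used && p->index == idx)
		p->used = 0;
}

static void toy_flush_fs(int pool)		/* unmount: surrender the pool */
{
	memset(pools[pool], 0, sizeof(pools[pool]));
}

int main(void)
{
	char buf[16];
	int pool = toy_init_fs();

	toy_put_page(pool, 42, "clean data");
	printf("get after put: %s\n",
	       toy_get_page(pool, 42, buf, sizeof(buf)) == 0 ? buf : "miss");
	toy_flush_page(pool, 42);		/* mirrors the flush-on-delete hooks */
	printf("get after flush: %s\n",
	       toy_get_page(pool, 42, buf, sizeof(buf)) == 0 ? buf : "miss");
	toy_flush_fs(pool);
	return 0;
}

The design point the model captures is that the backend is ephemeral: since every get is permitted to miss, the backend may discard pages at any time, and the hooks removed from filemap.c and truncate.c above only had to guarantee coherency (flush on truncate or delete), never persistence.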