From 8c8836f2f464d2ff4e717ba39557fe4c1ef09a21 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sun, 16 Dec 2012 21:11:54 +0100 Subject: [PATCH] --- yaml --- r: 344911 b: refs/heads/master c: 8547a5bc104496d54c66355df944c348e6525e3f h: refs/heads/master i: 344909: d661cedfa4803d640bbc74693aa810d69063e04b 344907: ca0b6589fd8c90b43306b642df2d617dedacb3c6 344903: 72f512d82a0246e345b3c5be933e909d38f49519 344895: 67feb969e832a180ec6a11d0d5c7e99f7c80046c v: v3 --- [refs] | 2 +- trunk/Documentation/filesystems/ext4.txt | 9 +- trunk/Documentation/kernel-parameters.txt | 3 - trunk/Documentation/prctl/seccomp_filter.txt | 74 +- trunk/Documentation/security/keys.txt | 17 - trunk/MAINTAINERS | 2 +- trunk/arch/sh/mm/Kconfig | 1 - trunk/arch/x86/Kconfig | 2 - trunk/arch/x86/include/asm/pgtable.h | 17 +- trunk/arch/x86/include/asm/pgtable_types.h | 20 - trunk/arch/x86/kernel/vsyscall_64.c | 110 +- trunk/arch/x86/mm/pgtable.c | 8 +- trunk/drivers/bus/Kconfig | 1 - trunk/drivers/char/tpm/tpm_ibmvtpm.c | 81 +- trunk/drivers/char/tpm/tpm_ibmvtpm.h | 5 +- trunk/drivers/input/keyboard/Kconfig | 1 - trunk/drivers/usb/phy/Kconfig | 1 - trunk/drivers/video/omap2/Kconfig | 4 - trunk/drivers/w1/masters/Kconfig | 1 - trunk/drivers/xen/swiotlb-xen.c | 25 +- trunk/fs/Kconfig | 4 +- trunk/fs/cifs/cifsacl.c | 12 +- trunk/fs/ext4/Kconfig | 15 + trunk/fs/ext4/Makefile | 4 +- trunk/fs/ext4/acl.c | 6 +- trunk/fs/ext4/dir.c | 41 +- trunk/fs/ext4/ext4.h | 165 +- trunk/fs/ext4/ext4_extents.h | 40 + trunk/fs/ext4/ext4_jbd2.h | 7 + trunk/fs/ext4/extents.c | 480 +++-- trunk/fs/ext4/extents_status.c | 500 ----- trunk/fs/ext4/extents_status.h | 45 - trunk/fs/ext4/file.c | 336 +--- trunk/fs/ext4/fsync.c | 6 +- trunk/fs/ext4/ialloc.c | 6 +- trunk/fs/ext4/indirect.c | 5 +- trunk/fs/ext4/inline.c | 1884 ------------------ trunk/fs/ext4/inode.c | 629 +++--- trunk/fs/ext4/mballoc.c | 60 +- trunk/fs/ext4/migrate.c | 1 - trunk/fs/ext4/move_extent.c | 1 - trunk/fs/ext4/namei.c | 531 ++--- trunk/fs/ext4/page-io.c | 3 +- trunk/fs/ext4/resize.c | 17 +- trunk/fs/ext4/super.c | 57 +- trunk/fs/ext4/symlink.c | 4 + trunk/fs/ext4/xattr.c | 110 +- trunk/fs/ext4/xattr.h | 158 +- trunk/fs/jbd2/journal.c | 1 + trunk/fs/jbd2/transaction.c | 11 + trunk/fs/nfs/idmap.c | 12 +- trunk/include/asm-generic/pgtable.h | 110 - trunk/include/linux/cred.h | 17 +- trunk/include/linux/huge_mm.h | 16 +- trunk/include/linux/hugetlb.h | 8 +- trunk/include/linux/jbd2.h | 9 +- trunk/include/linux/key.h | 1 - trunk/include/linux/mempolicy.h | 8 - trunk/include/linux/migrate.h | 46 +- trunk/include/linux/mm.h | 39 - trunk/include/linux/mm_types.h | 31 - trunk/include/linux/mmzone.h | 13 - trunk/include/linux/rmap.h | 33 +- trunk/include/linux/sched.h | 27 - trunk/include/linux/swiotlb.h | 20 +- trunk/include/linux/vm_event_item.h | 12 +- trunk/include/linux/vmstat.h | 8 - trunk/include/trace/events/ext4.h | 136 +- trunk/include/trace/events/migrate.h | 51 - trunk/include/uapi/linux/mempolicy.h | 15 +- trunk/init/Kconfig | 44 - trunk/kernel/cred.c | 127 +- trunk/kernel/fork.c | 3 - trunk/kernel/sched/core.c | 71 +- trunk/kernel/sched/fair.c | 227 --- trunk/kernel/sched/features.h | 11 - trunk/kernel/sched/sched.h | 12 - trunk/kernel/seccomp.c | 13 +- trunk/kernel/sysctl.c | 45 +- trunk/lib/swiotlb.c | 269 ++- trunk/mm/compaction.c | 15 +- trunk/mm/huge_memory.c | 108 +- trunk/mm/hugetlb.c | 10 +- trunk/mm/internal.h | 7 +- trunk/mm/ksm.c | 6 +- trunk/mm/memcontrol.c | 7 +- trunk/mm/memory-failure.c | 7 +- trunk/mm/memory.c | 198 +- trunk/mm/memory_hotplug.c | 3 +- 
trunk/mm/mempolicy.c | 283 +-- trunk/mm/migrate.c | 337 +--- trunk/mm/mmap.c | 10 +- trunk/mm/mprotect.c | 135 +- trunk/mm/mremap.c | 2 +- trunk/mm/page_alloc.c | 10 +- trunk/mm/pgtable-generic.c | 9 +- trunk/mm/rmap.c | 66 +- trunk/mm/vmstat.c | 16 +- trunk/net/dns_resolver/dns_key.c | 15 +- trunk/security/keys/key.c | 6 +- trunk/security/keys/keyctl.c | 15 +- trunk/security/keys/keyring.c | 10 +- trunk/security/keys/process_keys.c | 92 +- trunk/security/keys/request_key.c | 21 +- trunk/security/smack/Kconfig | 6 +- trunk/security/smack/smackfs.c | 17 - trunk/security/yama/yama_lsm.c | 88 +- 107 files changed, 1801 insertions(+), 6655 deletions(-) delete mode 100644 trunk/fs/ext4/extents_status.c delete mode 100644 trunk/fs/ext4/extents_status.h delete mode 100644 trunk/fs/ext4/inline.c delete mode 100644 trunk/include/trace/events/migrate.h diff --git a/[refs] b/[refs] index 53eddc986812..e1e3f7f6c4bf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 9b690c3d56ce15dd265b6398f9d8d58c29c17032 +refs/heads/master: 8547a5bc104496d54c66355df944c348e6525e3f diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt index 34ea4f1fa6ea..104322bf378c 100644 --- a/trunk/Documentation/filesystems/ext4.txt +++ b/trunk/Documentation/filesystems/ext4.txt @@ -200,9 +200,12 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -nouser_xattr Disables Extended User Attributes. See the - attr(5) manual page and http://acl.bestbits.at/ - for more information about extended attributes. +nouser_xattr Disables Extended User Attributes. If you have extended + attribute support enabled in the kernel configuration + (CONFIG_EXT4_FS_XATTR), extended attribute support + is enabled by default on mount. See the attr(5) manual + page and http://acl.bestbits.at/ for more information + about extended attributes. noacl This option disables POSIX Access Control List support. If ACL support is enabled in the kernel diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index ea8e5b485576..20e248cc03a9 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -2032,9 +2032,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. - Allowed values are enable and disable - numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/trunk/Documentation/prctl/seccomp_filter.txt b/trunk/Documentation/prctl/seccomp_filter.txt index 1e469ef75778..597c3c581375 100644 --- a/trunk/Documentation/prctl/seccomp_filter.txt +++ b/trunk/Documentation/prctl/seccomp_filter.txt @@ -95,15 +95,12 @@ SECCOMP_RET_KILL: SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering - task without executing the system call. siginfo->si_call_addr - will show the address of the system call instruction, and - siginfo->si_syscall and siginfo->si_arch will indicate which - syscall was attempted. The program counter will be as though - the syscall happened (i.e. it will not point to the syscall - instruction). 
The return value register will contain an arch- - dependent value -- if resuming execution, set it to something - sensible. (The architecture dependency is because replacing - it with -ENOSYS could overwrite some useful information.) + task without executing the system call. The kernel will + rollback the register state to just before the system call + entry such that a signal handler in the task will be able to + inspect the ucontext_t->uc_mcontext registers and emulate + system call success or failure upon return from the signal + handler. The SECCOMP_RET_DATA portion of the return value will be passed as si_errno. @@ -126,18 +123,6 @@ SECCOMP_RET_TRACE: the BPF program return value will be available to the tracer via PTRACE_GETEVENTMSG. - The tracer can skip the system call by changing the syscall number - to -1. Alternatively, the tracer can change the system call - requested by changing the system call to a valid syscall number. If - the tracer asks to skip the system call, then the system call will - appear to return the value that the tracer puts in the return value - register. - - The seccomp check will not be run again after the tracer is - notified. (This means that seccomp-based sandboxes MUST NOT - allow use of ptrace, even of other sandboxed processes, without - extreme care; ptracers can use this mechanism to escape.) - SECCOMP_RET_ALLOW: Results in the system call being executed. @@ -176,50 +161,3 @@ architecture supports both ptrace_event and seccomp, it will be able to support seccomp filter with minor fixup: SIGSYS support and seccomp return value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER to its arch-specific Kconfig. - - - -Caveats -------- - -The vDSO can cause some system calls to run entirely in userspace, -leading to surprises when you run programs on different machines that -fall back to real syscalls. To minimize these surprises on x86, make -sure you test with -/sys/devices/system/clocksource/clocksource0/current_clocksource set to -something like acpi_pm. - -On x86-64, vsyscall emulation is enabled by default. (vsyscalls are -legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: - -- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to - the vsyscall entry for the given call and not the address after the - 'syscall' instruction. Any code which wants to restart the call - should be aware that (a) a ret instruction has been emulated and (b) - trying to resume the syscall will again trigger the standard vsyscall - emulation security checks, making resuming the syscall mostly - pointless. - -- A return value of SECCOMP_RET_TRACE will signal the tracer as usual, - but the syscall may not be changed to another system call using the - orig_rax register. It may only be changed to -1 order to skip the - currently emulated call. Any other change MAY terminate the process. - The rip value seen by the tracer will be the syscall entry address; - this is different from normal behavior. The tracer MUST NOT modify - rip or rsp. (Do not rely on other changes terminating the process. - They might work. For example, on some kernels, choosing a syscall - that only exists in future kernels will be correctly emulated (by - returning -ENOSYS). - -To detect this quirky behavior, check for addr & ~0x0C00 == -0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For -SECCOMP_RET_TRAP, use siginfo->si_call_addr.) 
Do not check any other -condition: future kernels may improve vsyscall emulation and current -kernels in vsyscall=native mode will behave differently, but the -instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these -cases. - -Note that modern systems are unlikely to use vsyscalls at all -- they -are a legacy feature and they are considerably slower than standard -syscalls. New code will use the vDSO, and vDSO-issued system calls -are indistinguishable from normal system calls. diff --git a/trunk/Documentation/security/keys.txt b/trunk/Documentation/security/keys.txt index 7b4145d00452..7d9ca92022d8 100644 --- a/trunk/Documentation/security/keys.txt +++ b/trunk/Documentation/security/keys.txt @@ -994,23 +994,6 @@ payload contents" for more information. reference pointer if successful. -(*) A keyring can be created by: - - struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - const struct cred *cred, - key_perm_t perm, - unsigned long flags, - struct key *dest); - - This creates a keyring with the given attributes and returns it. If dest - is not NULL, the new keyring will be linked into the keyring to which it - points. No permission checks are made upon the destination keyring. - - Error EDQUOT can be returned if the keyring would overload the quota (pass - KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted - towards the user's quota). Error ENOMEM can also be returned. - - (*) To check the validity of a key, this function can be called: int validate_key(struct key *key); diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index f71d2f901a69..d9c31b906ac9 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -3712,7 +3712,7 @@ I2C/SMBUS STUB DRIVER M: "Mark M. Hoffman" L: linux-i2c@vger.kernel.org S: Maintained -F: drivers/i2c/busses/i2c-stub.c +F: drivers/i2c/i2c-stub.c I2C SUBSYSTEM M: Wolfram Sang diff --git a/trunk/arch/sh/mm/Kconfig b/trunk/arch/sh/mm/Kconfig index 0f7c852f355c..cb8f9920f4dd 100644 --- a/trunk/arch/sh/mm/Kconfig +++ b/trunk/arch/sh/mm/Kconfig @@ -111,7 +111,6 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL - select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 97f8c5ad8c2d..65a872bf72f9 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -22,8 +22,6 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index 5199db2923d3..a1f780d45f76 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -404,14 +404,7 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) -{ - return pte_flags(a) & _PAGE_PRESENT; + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } static inline int pte_hidden(pte_t pte) @@ -427,8 +420,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). 
*/ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) @@ -487,11 +479,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } diff --git a/trunk/arch/x86/include/asm/pgtable_types.h b/trunk/arch/x86/include/asm/pgtable_types.h index 3c32db8c539d..ec8a1fc9505d 100644 --- a/trunk/arch/x86/include/asm/pgtable_types.h +++ b/trunk/arch/x86/include/asm/pgtable_types.h @@ -64,26 +64,6 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -/* - * _PAGE_NUMA indicates that this page will trigger a numa hinting - * minor page fault to gather numa placement statistics (see - * pte_numa()). The bit picked (8) is within the range between - * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't - * require changes to the swp entry format because that bit is always - * zero when the pte is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - * - * Because we shared the same bit (8) with _PAGE_PROTNONE this can be - * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE - * couldn't reach, like handle_mm_fault() (see access_error in - * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for - * handle_mm_fault() to be invoked). - */ -#define _PAGE_NUMA _PAGE_PROTNONE - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ diff --git a/trunk/arch/x86/kernel/vsyscall_64.c b/trunk/arch/x86/kernel/vsyscall_64.c index 9a907a67be8f..3a3e8c9e280d 100644 --- a/trunk/arch/x86/kernel/vsyscall_64.c +++ b/trunk/arch/x86/kernel/vsyscall_64.c @@ -145,6 +145,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -177,9 +190,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; + int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -211,84 +225,56 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; /* - * Check for access_ok violations and find the syscall nr. - * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. 
For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ + ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - if (tmp) - goto do_ret; /* skip requested */ + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -297,7 +283,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; -check_fault: + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -320,6 +311,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; +done: return true; sigsegv: diff --git a/trunk/arch/x86/mm/pgtable.c b/trunk/arch/x86/mm/pgtable.c index e27fbf887f3b..217eb705fac0 100644 --- a/trunk/arch/x86/mm/pgtable.c +++ b/trunk/arch/x86/mm/pgtable.c @@ -301,13 +301,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -/* - * Used to set accessed or dirty bits in the page table entries - * on other architectures. On x86, the accessed and dirty bits - * are tracked by hardware. However, do_wp_page calls this function - * to also make the pte writeable at the same time the dirty bit is - * set. In that case we do actually need to write the PTE. 
- */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -317,6 +310,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); } return changed; diff --git a/trunk/drivers/bus/Kconfig b/trunk/drivers/bus/Kconfig index 0f51ed687dc8..bbec35d21fe5 100644 --- a/trunk/drivers/bus/Kconfig +++ b/trunk/drivers/bus/Kconfig @@ -6,7 +6,6 @@ menu "Bus devices" config OMAP_OCP2SCP tristate "OMAP OCP2SCP DRIVER" - depends on ARCH_OMAP2PLUS help Driver to enable ocp2scp module which transforms ocp interface protocol to scp protocol. In OMAP4, USB PHY is connected via diff --git a/trunk/drivers/char/tpm/tpm_ibmvtpm.c b/trunk/drivers/char/tpm/tpm_ibmvtpm.c index 9978609d93b2..7da840d487d2 100644 --- a/trunk/drivers/char/tpm/tpm_ibmvtpm.c +++ b/trunk/drivers/char/tpm/tpm_ibmvtpm.c @@ -38,6 +38,8 @@ static struct vio_device_id tpm_ibmvtpm_device_table[] = { }; MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table); +DECLARE_WAIT_QUEUE_HEAD(wq); + /** * ibmvtpm_send_crq - Send a CRQ request * @vdev: vio device struct @@ -81,7 +83,6 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm; u16 len; - int sig; ibmvtpm = (struct ibmvtpm_dev *)chip->vendor.data; @@ -90,23 +91,22 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) return 0; } - sig = wait_event_interruptible(ibmvtpm->wq, ibmvtpm->res_len != 0); - if (sig) - return -EINTR; - - len = ibmvtpm->res_len; + wait_event_interruptible(wq, ibmvtpm->crq_res.len != 0); - if (count < len) { + if (count < ibmvtpm->crq_res.len) { dev_err(ibmvtpm->dev, "Invalid size in recv: count=%ld, crq_size=%d\n", - count, len); + count, ibmvtpm->crq_res.len); return -EIO; } spin_lock(&ibmvtpm->rtce_lock); - memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, len); - memset(ibmvtpm->rtce_buf, 0, len); - ibmvtpm->res_len = 0; + memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, ibmvtpm->crq_res.len); + memset(ibmvtpm->rtce_buf, 0, ibmvtpm->crq_res.len); + ibmvtpm->crq_res.valid = 0; + ibmvtpm->crq_res.msg = 0; + len = ibmvtpm->crq_res.len; + ibmvtpm->crq_res.len = 0; spin_unlock(&ibmvtpm->rtce_lock); return len; } @@ -273,6 +273,7 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev) int rc = 0; free_irq(vdev->irq, ibmvtpm); + tasklet_kill(&ibmvtpm->tasklet); do { if (rc) @@ -371,6 +372,7 @@ static int ibmvtpm_reset_crq(struct ibmvtpm_dev *ibmvtpm) static int tpm_ibmvtpm_resume(struct device *dev) { struct ibmvtpm_dev *ibmvtpm = ibmvtpm_get_data(dev); + unsigned long flags; int rc = 0; do { @@ -385,11 +387,10 @@ static int tpm_ibmvtpm_resume(struct device *dev) return rc; } - rc = vio_enable_interrupts(ibmvtpm->vdev); - if (rc) { - dev_err(dev, "Error vio_enable_interrupts rc=%d\n", rc); - return rc; - } + spin_lock_irqsave(&ibmvtpm->lock, flags); + vio_disable_interrupts(ibmvtpm->vdev); + tasklet_schedule(&ibmvtpm->tasklet); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); rc = ibmvtpm_crq_send_init(ibmvtpm); if (rc) @@ -466,7 +467,7 @@ static struct ibmvtpm_crq *ibmvtpm_crq_get_next(struct ibmvtpm_dev *ibmvtpm) if (crq->valid & VTPM_MSG_RES) { if (++crq_q->index == crq_q->num_entry) crq_q->index = 0; - smp_rmb(); + rmb(); } else crq = NULL; return crq; @@ -534,9 +535,11 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, ibmvtpm->vtpm_version = crq->data; return; case VTPM_TPM_COMMAND_RES: - /* len of the data in rtce buffer 
*/ - ibmvtpm->res_len = crq->len; - wake_up_interruptible(&ibmvtpm->wq); + ibmvtpm->crq_res.valid = crq->valid; + ibmvtpm->crq_res.msg = crq->msg; + ibmvtpm->crq_res.len = crq->len; + ibmvtpm->crq_res.data = crq->data; + wake_up_interruptible(&wq); return; default: return; @@ -556,19 +559,38 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, static irqreturn_t ibmvtpm_interrupt(int irq, void *vtpm_instance) { struct ibmvtpm_dev *ibmvtpm = (struct ibmvtpm_dev *) vtpm_instance; + unsigned long flags; + + spin_lock_irqsave(&ibmvtpm->lock, flags); + vio_disable_interrupts(ibmvtpm->vdev); + tasklet_schedule(&ibmvtpm->tasklet); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); + + return IRQ_HANDLED; +} + +/** + * ibmvtpm_tasklet - Interrupt handler tasklet + * @data: ibm vtpm device struct + * + * Returns: + * Nothing + **/ +static void ibmvtpm_tasklet(void *data) +{ + struct ibmvtpm_dev *ibmvtpm = data; struct ibmvtpm_crq *crq; + unsigned long flags; - /* while loop is needed for initial setup (get version and - * get rtce_size). There should be only one tpm request at any - * given time. - */ + spin_lock_irqsave(&ibmvtpm->lock, flags); while ((crq = ibmvtpm_crq_get_next(ibmvtpm)) != NULL) { ibmvtpm_crq_process(crq, ibmvtpm); crq->valid = 0; - smp_wmb(); + wmb(); } - return IRQ_HANDLED; + vio_enable_interrupts(ibmvtpm->vdev); + spin_unlock_irqrestore(&ibmvtpm->lock, flags); } /** @@ -628,6 +650,9 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto reg_crq_cleanup; } + tasklet_init(&ibmvtpm->tasklet, (void *)ibmvtpm_tasklet, + (unsigned long)ibmvtpm); + rc = request_irq(vio_dev->irq, ibmvtpm_interrupt, 0, tpm_ibmvtpm_driver_name, ibmvtpm); if (rc) { @@ -641,14 +666,13 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto init_irq_cleanup; } - init_waitqueue_head(&ibmvtpm->wq); - crq_q->index = 0; ibmvtpm->dev = dev; ibmvtpm->vdev = vio_dev; chip->vendor.data = (void *)ibmvtpm; + spin_lock_init(&ibmvtpm->lock); spin_lock_init(&ibmvtpm->rtce_lock); rc = ibmvtpm_crq_send_init(ibmvtpm); @@ -665,6 +689,7 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, return rc; init_irq_cleanup: + tasklet_kill(&ibmvtpm->tasklet); do { rc1 = plpar_hcall_norets(H_FREE_CRQ, vio_dev->unit_address); } while (rc1 == H_BUSY || H_IS_LONG_BUSY(rc1)); diff --git a/trunk/drivers/char/tpm/tpm_ibmvtpm.h b/trunk/drivers/char/tpm/tpm_ibmvtpm.h index bd82a791f995..4296eb4b4d82 100644 --- a/trunk/drivers/char/tpm/tpm_ibmvtpm.h +++ b/trunk/drivers/char/tpm/tpm_ibmvtpm.h @@ -38,12 +38,13 @@ struct ibmvtpm_dev { struct vio_dev *vdev; struct ibmvtpm_crq_queue crq_queue; dma_addr_t crq_dma_handle; + spinlock_t lock; + struct tasklet_struct tasklet; u32 rtce_size; void __iomem *rtce_buf; dma_addr_t rtce_dma_handle; spinlock_t rtce_lock; - wait_queue_head_t wq; - u16 res_len; + struct ibmvtpm_crq crq_res; u32 vtpm_version; }; diff --git a/trunk/drivers/input/keyboard/Kconfig b/trunk/drivers/input/keyboard/Kconfig index febead4bf8a5..77629d33f03f 100644 --- a/trunk/drivers/input/keyboard/Kconfig +++ b/trunk/drivers/input/keyboard/Kconfig @@ -544,7 +544,6 @@ config KEYBOARD_OMAP config KEYBOARD_OMAP4 tristate "TI OMAP4+ keypad support" - depends on ARCH_OMAP2PLUS select INPUT_MATRIXKMAP help Say Y here if you want to use the OMAP4+ keypad. 
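The tpm_ibmvtpm hunks above switch the driver to a hard interrupt handler plus tasklet bottom half: the handler masks the device interrupt and schedules a tasklet, the tasklet drains the response queue under ibmvtpm->lock before re-enabling interrupts, and tasklet_kill() is the matching teardown in the remove and error paths. Below is a minimal sketch of that generic pattern, not code from the patch; my_dev, drain_one_event(), dev_mask_irq() and dev_unmask_irq() are hypothetical stand-ins for the driver's CRQ processing loop and vio_disable_interrupts()/vio_enable_interrupts().

    #include <linux/interrupt.h>
    #include <linux/spinlock.h>

    struct my_dev {
            spinlock_t lock;
            struct tasklet_struct tasklet;
            int irq;
    };

    /* Top half: keep it short -- mask the source and defer the real work. */
    static irqreturn_t my_dev_interrupt(int irq, void *data)
    {
            struct my_dev *dev = data;
            unsigned long flags;

            spin_lock_irqsave(&dev->lock, flags);
            dev_mask_irq(dev);               /* hypothetical, like vio_disable_interrupts() */
            tasklet_schedule(&dev->tasklet);
            spin_unlock_irqrestore(&dev->lock, flags);
            return IRQ_HANDLED;
    }

    /* Bottom half: drain queued responses, then unmask the interrupt. */
    static void my_dev_tasklet(unsigned long data)
    {
            struct my_dev *dev = (struct my_dev *)data;
            unsigned long flags;

            spin_lock_irqsave(&dev->lock, flags);
            while (drain_one_event(dev))     /* hypothetical, like the CRQ loop above */
                    ;
            dev_unmask_irq(dev);             /* hypothetical, like vio_enable_interrupts() */
            spin_unlock_irqrestore(&dev->lock, flags);
    }

    /* Probe-time wiring, mirroring the tasklet_init()/request_irq() calls above. */
    static int my_dev_setup(struct my_dev *dev)
    {
            spin_lock_init(&dev->lock);
            tasklet_init(&dev->tasklet, my_dev_tasklet, (unsigned long)dev);
            return request_irq(dev->irq, my_dev_interrupt, 0, "my_dev", dev);
    }

The design choice visible in the hunks is the usual one for slow virtual-IO queues: the hard handler only quiesces the interrupt source and defers, so queue processing and the re-enable happen in softirq context without holding off other interrupts.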
diff --git a/trunk/drivers/usb/phy/Kconfig b/trunk/drivers/usb/phy/Kconfig index 5de6e7f39f9c..7eb73c561bd2 100644 --- a/trunk/drivers/usb/phy/Kconfig +++ b/trunk/drivers/usb/phy/Kconfig @@ -6,7 +6,6 @@ comment "USB Physical Layer drivers" config OMAP_USB2 tristate "OMAP USB2 PHY Driver" - depends on ARCH_OMAP2PLUS select USB_OTG_UTILS help Enable this to support the transceiver that is part of SOC. This diff --git a/trunk/drivers/video/omap2/Kconfig b/trunk/drivers/video/omap2/Kconfig index b07b2b042e7e..346d67d6cf4d 100644 --- a/trunk/drivers/video/omap2/Kconfig +++ b/trunk/drivers/video/omap2/Kconfig @@ -1,10 +1,6 @@ config OMAP2_VRFB bool -if ARCH_OMAP2PLUS - source "drivers/video/omap2/dss/Kconfig" source "drivers/video/omap2/omapfb/Kconfig" source "drivers/video/omap2/displays/Kconfig" - -endif diff --git a/trunk/drivers/w1/masters/Kconfig b/trunk/drivers/w1/masters/Kconfig index e8ca63a82b97..c433a746e3f5 100644 --- a/trunk/drivers/w1/masters/Kconfig +++ b/trunk/drivers/w1/masters/Kconfig @@ -60,7 +60,6 @@ config W1_MASTER_GPIO config HDQ_MASTER_OMAP tristate "OMAP HDQ driver" - depends on ARCH_OMAP help Say Y here if you want support for the 1-wire or HDQ Interface on an OMAP processor. diff --git a/trunk/drivers/xen/swiotlb-xen.c b/trunk/drivers/xen/swiotlb-xen.c index af47e7594460..58db6df866ef 100644 --- a/trunk/drivers/xen/swiotlb-xen.c +++ b/trunk/drivers/xen/swiotlb-xen.c @@ -338,8 +338,9 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); + void *map; BUG_ON(dir == DMA_NONE); /* @@ -355,10 +356,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * Oh well, have to allocate and map a bounce buffer. */ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (map == SWIOTLB_MAP_ERROR) + if (!map) return DMA_ERROR_CODE; - dev_addr = xen_phys_to_bus(map); + dev_addr = xen_virt_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -388,7 +389,7 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -433,7 +434,8 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); return; } @@ -492,12 +494,11 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - phys_addr_t map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, - dir); - if (map == SWIOTLB_MAP_ERROR) { + void *map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, dir); + if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. 
*/ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, @@ -505,7 +506,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sgl[0].dma_length = 0; return DMA_ERROR_CODE; } - sg->dma_address = xen_phys_to_bus(map); + sg->dma_address = xen_virt_to_bus(map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; diff --git a/trunk/fs/Kconfig b/trunk/fs/Kconfig index eaff24a19502..f95ae3a027f3 100644 --- a/trunk/fs/Kconfig +++ b/trunk/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/trunk/fs/cifs/cifsacl.c b/trunk/fs/cifs/cifsacl.c index 5cbd00e74067..75c1ee699143 100644 --- a/trunk/fs/cifs/cifsacl.c +++ b/trunk/fs/cifs/cifsacl.c @@ -346,15 +346,19 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; diff --git a/trunk/fs/ext4/Kconfig b/trunk/fs/ext4/Kconfig index 0a475c881852..c22f17021b6e 100644 --- a/trunk/fs/ext4/Kconfig +++ b/trunk/fs/ext4/Kconfig @@ -39,8 +39,22 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +67,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. 
This option diff --git a/trunk/fs/ext4/Makefile b/trunk/fs/ext4/Makefile index 0310fec2ee3d..56fd8f865930 100644 --- a/trunk/fs/ext4/Makefile +++ b/trunk/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o + mmp.o indirect.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/trunk/fs/ext4/acl.c b/trunk/fs/ext4/acl.c index e6e0d988439b..d3c5b88fd89f 100644 --- a/trunk/fs/ext4/acl.c +++ b/trunk/fs/ext4/acl.c @@ -423,10 +423,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, retry: handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); error = ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) diff --git a/trunk/fs/ext4/dir.c b/trunk/fs/ext4/dir.c index b8d877f6c1fa..8e07d2a5a139 100644 --- a/trunk/fs/ext4/dir.c +++ b/trunk/fs/ext4/dir.c @@ -27,11 +27,23 @@ #include #include #include "ext4.h" -#include "xattr.h" + +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return (ext4_filetype_table[filetype]); +} + /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which chould potentially get coverted to use htree @@ -56,14 +68,11 @@ static int is_dx_dir(struct inode *inode) * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... - * - * bh passed here can be an inode block or a dir data block, depending - * on the inode inline data flag. 
*/ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, char *buf, int size, + struct buffer_head *bh, unsigned int offset) { const char *error_msg = NULL; @@ -76,8 +85,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) - error_msg = "directory entry across range"; + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -88,14 +98,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -115,14 +125,6 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -219,9 +221,8 @@ static int ext4_readdir(struct file *filp, while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, bh, - bh->b_data, bh->b_size, - offset)) { + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index 8462eb3c33aa..df163da388c9 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -57,16 +57,6 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -402,7 +392,6 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ -#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -459,26 +448,28 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ - EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
*/ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -/* - * Since it's pretty easy to mix up bit numbers and hex values, we use a - * build-time check to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost - * any extra space in the compiled kernel image, otherwise, the build will fail. - * It's important that these values are the same, since we are using - * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent - * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk - * values found in ext2, ext3 and ext4 filesystems, and of course the values - * defined in e2fsprogs. +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) - static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); @@ -503,7 +494,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); - CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -821,8 +811,6 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; -#include "extents_status.h" - /* * fourth extended file system inode data in memory */ @@ -845,6 +833,7 @@ struct ext4_inode_info { #endif unsigned long i_flags; +#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -853,6 +842,7 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; +#endif struct list_head i_orphan; /* unlinked but open inodes */ @@ -898,10 +888,6 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; - /* extents status tree */ - struct ext4_es_tree i_es_tree; - rwlock_t i_es_lock; - /* ialloc */ ext4_group_t i_last_alloc_group; @@ -916,10 +902,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - /* Indicate the inline data space. 
*/ - u16 i_inline_off; - u16 i_inline_size; - #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1378,7 +1360,6 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ - EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1500,7 +1481,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1524,8 +1505,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP | \ - EXT4_FEATURE_INCOMPAT_INLINE_DATA) + EXT4_FEATURE_INCOMPAT_MMP) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1612,11 +1592,6 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; -#define EXT4_DIRENT_TAIL(block, blocksize) \ - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ - ((blocksize) - \ - sizeof(struct ext4_dir_entry_tail)))) - /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. 
@@ -1961,42 +1936,14 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, char *, int, - unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (buf), (size), (offset))) + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, - void *buf, int buf_size, - const char *name, int namelen, - struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen); -static inline void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static inline unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return ext4_filetype_table[filetype]; -} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -2047,23 +1994,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create); -int ext4_walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)); -int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh); -#define FALL_BACK_TO_NONDELALLOC 1 -#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); @@ -2118,20 +2050,6 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); -extern int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - void *entry_buf, - int buf_size, - int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -2458,15 +2376,6 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct 
inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); -extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len); -extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize); -extern int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2484,9 +2393,6 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ -struct ext4_ext_path; -struct ext4_extent; - extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2504,27 +2410,8 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path *, - struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - - /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2558,10 +2445,14 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test whether block and inode bitmaps are properly @@ -2612,4 +2503,6 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/trunk/fs/ext4/ext4_extents.h b/trunk/fs/ext4/ext4_extents.h index 487fda12bc00..cb1b2c919963 100644 --- a/trunk/fs/ext4/ext4_extents.h +++ b/trunk/fs/ext4/ext4_extents.h @@ -42,6 +42,16 @@ */ #define CHECK_BINSEARCH__ +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. 
@@ -133,6 +143,20 @@ struct ext4_ext_path { * structure for external API */ +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. @@ -276,5 +300,21 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/trunk/fs/ext4/ext4_jbd2.h b/trunk/fs/ext4/ext4_jbd2.h index 7177f9b21cb2..56d258c18303 100644 --- a/trunk/fs/ext4/ext4_jbd2.h +++ b/trunk/fs/ext4/ext4_jbd2.h @@ -254,6 +254,13 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c index 26af22832a84..7011ac967208 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -41,8 +41,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" -#include "xattr.h" #include @@ -111,9 +109,6 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); -static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex); - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -1964,33 +1959,27 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, return err; } -static int ext4_fill_fiemap_extents(struct inode *inode, - ext4_lblk_t block, ext4_lblk_t num, - struct fiemap_extent_info *fieinfo) +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache newex; + struct ext4_ext_cache cbex; struct ext4_extent *ex; - ext4_lblk_t next, next_del, start = 0, end = 0; + ext4_lblk_t next, start = 0, end = 0; ext4_lblk_t last = block + num; - int exists, depth = 0, err = 0; - unsigned int flags = 0; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ 
down_read(&EXT4_I(inode)->i_data_sem); - - if (path && ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; - } - path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { - up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1998,16 +1987,13 @@ static int ext4_fill_fiemap_extents(struct inode *inode, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { - up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); - flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -2044,64 +2030,40 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - newex.ec_block = start; - newex.ec_len = end - start; - newex.ec_start = 0; + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; } else { - newex.ec_block = le32_to_cpu(ex->ee_block); - newex.ec_len = ext4_ext_get_actual_len(ex); - newex.ec_start = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - } - - /* - * Find delayed extent and update newex accordingly. We call - * it even in !exists case to find out whether newex is the - * last existing extent or not. - */ - next_del = ext4_find_delayed_extent(inode, &newex); - if (!exists && next_del) { - exists = 1; - flags |= FIEMAP_EXTENT_DELALLOC; + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext4_ext_pblock(ex); } - up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(newex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); err = -EIO; break; } + err = func(inode, next, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); - /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ - if (next == next_del) { - flags |= FIEMAP_EXTENT_LAST; - if (unlikely(next_del != EXT_MAX_BLOCKS || - next != EXT_MAX_BLOCKS)) { - EXT4_ERROR_INODE(inode, - "next extent == %u, next " - "delalloc extent = %u", - next, next_del); - err = -EIO; - break; - } + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; } - if (exists) { - err = fiemap_fill_next_extent(fieinfo, - (__u64)newex.ec_block << blksize_bits, - (__u64)newex.ec_start << blksize_bits, - (__u64)newex.ec_len << blksize_bits, - flags); - if (err < 0) - break; - if (err == 1) { - err = 0; - break; - } + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; } - block = newex.ec_block + newex.ec_len; + block = cbex.ec_block + cbex.ec_len; } if (path) { @@ -2194,6 +2156,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; int ret = 0; /* @@ -2201,6 +2164,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); /* has cache valid data? 
*/ if (cex->ec_len == 0) @@ -2309,13 +2273,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; - int depth; - - /* If we are converting the inline data, only one is needed here. */ - if (ext4_has_inline_data(inode)) - return 1; - - depth = ext_depth(inode); + int depth = ext_depth(inode); if (chunk) index = depth * 2; @@ -3503,34 +3461,115 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Return 1 if there is a delalloc block in the range, otherwise 0. + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. */ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) + ext4_lblk_t lblk_end, + int search_hint_reverse) { - struct extent_status es; + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; - es.start = lblk_start; - ext4_es_find_extent(inode, &es); - if (es.len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.start <= lblk_start && lblk_start < es.start + es.len) - return 1; - else if (lblk_start <= es.start && es.start <= lblk_end) - return 1; - else + if (!test_opt(inode->i_sb, DELALLOC)) return 0; + + /* reverse search wont work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page) + goto nextpage; + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { + page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. 
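
The restored ext4_find_delalloc_range() above keeps hopping between logical block numbers and page-cache indices with shifts by (PAGE_CACHE_SHIFT - inode->i_blkbits). A standalone sketch of that arithmetic, assuming 4 KiB pages and a 1 KiB filesystem block size purely for illustration:

/*
 * Illustration of the lblk <-> page-index arithmetic used by
 * ext4_find_delalloc_range().  4 KiB pages and 1 KiB blocks are assumed
 * only for the sake of the example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* 4 KiB pages */
	const unsigned int blkbits = 10;	/* 1 KiB blocks */
	const unsigned int shift = page_shift - blkbits;
	unsigned int lblk = 13;			/* some logical block */

	/* index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits) */
	unsigned long index = lblk >> shift;

	/* pg_lblk = index << (PAGE_CACHE_SHIFT - inode->i_blkbits) */
	unsigned int pg_lblk = index << shift;

	printf("lblk %u lives in page %lu, whose first lblk is %u\n",
	       lblk, index, pg_lblk);
	/* -> lblk 13 lives in page 3, whose first lblk is 12 */
	return 0;
}
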
+ */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); } /** @@ -3591,7 +3630,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) allocated_clusters--; } @@ -3601,7 +3640,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) allocated_clusters--; } @@ -3624,8 +3663,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, - allocated, newblock); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { @@ -3872,7 +3911,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -3888,7 +3927,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3968,15 +4007,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len, ee_start); goto out; } - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + return ret; } } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* @@ -4245,8 +4284,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, kfree(path); } -out3: - trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); return err ? 
err : allocated; } @@ -4305,8 +4344,6 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_es_remove_extent(inode, last_block, - EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final @@ -4397,10 +4434,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); - ret = ext4_convert_inline_data(inode); - if (ret) - return ret; - trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4539,43 +4572,206 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * If newex is not existing extent (newex->ec_start equals zero) find - * delayed extent at start of newex and update newex accordingly and - * return start of the next delayed extent. - * - * If newex is existing extent (newex->ec_start is not equal zero) - * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newex unmodified. + * Callback function called for each extent to gather FIEMAP information. */ -static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex) +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) { - struct extent_status es; - ext4_lblk_t next_del; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; - es.start = newex->ec_block; - next_del = ext4_es_find_extent(inode, &es); + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. */ - if (es.len == 0) - /* A hole found. */ - return 0; + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + pgoff_t start_index = 0; + struct page **pages = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; - if (es.start > newex->ec_block) { - /* A hole found. */ - newex->ec_len = min(es.start - newex->ec_block, - newex->ec_len); - return 0; + offset = logical >> PAGE_SHIFT; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } + index = 0; + +next_page: + /* Try to find the 1st mapped buffer. 
*/ + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[index])) + goto out; + head = page_buffers(pages[index]); + if (!head) + goto out; + + index++; + bh = head; + do { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; + /* get the 1st mapped buffer. */ + goto found_mapped_buffer; + } + + bh = bh->b_this_page; + end++; + } while (bh != head); + + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; + } else { + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; + index = 1; + start_index = 0; + } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + + if (pages[index]->index != + pages[start_index]->index + index + - start_index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. 
*/ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; } - newex->ec_len = es.start + es.len - newex->ec_block; + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); } - return next_del; + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + if (next == EXT_MAX_BLOCKS) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (ret < 0) + return ret; + if (ret == 1) + return EXT_BREAK; + return EXT_CONTINUE; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4775,8 +4971,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); ext4_ext_invalidate_cache(inode); @@ -4797,22 +4991,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) mutex_unlock(&inode->i_mutex); return err; } - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; int error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline = 1; - - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); - - if (has_inline) - return error; - } - /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4834,11 +5018,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* - * Walk the extent tree gathering extent information - * and pushing extents back to the user. + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. */ - error = ext4_fill_fiemap_extents(inode, start_blk, - len_blks, fieinfo); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); } return error; diff --git a/trunk/fs/ext4/extents_status.c b/trunk/fs/ext4/extents_status.c deleted file mode 100644 index 564d981a2fcc..000000000000 --- a/trunk/fs/ext4/extents_status.c +++ /dev/null @@ -1,500 +0,0 @@ -/* - * fs/ext4/extents_status.c - * - * Written by Yongqiang Yang - * Modified by - * Allison Henderson - * Hugh Dickins - * Zheng Liu - * - * Ext4 extents status tree core functions. - */ -#include -#include "ext4.h" -#include "extents_status.h" -#include "ext4_extents.h" - -#include - -/* - * According to previous discussion in Ext4 Developer Workshop, we - * will introduce a new structure called io tree to track all extent - * status in order to solve some problems that we have met - * (e.g. Reservation space warning), and provide extent-level locking. - * Delay extent tree is the first step to achieve this goal. It is - * original built by Yongqiang Yang. At that time it is called delay - * extent tree, whose goal is only track delay extent in memory to - * simplify the implementation of fiemap and bigalloc, and introduce - * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called - * delay extent tree at the following comment. But for better - * understand what it does, it has been rename to extent status tree. 
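
For orientation, the ext4_fiemap()/ext4_ext_fiemap_cb path restored in this hunk reports its results to user space through fiemap_fill_next_extent(); a minimal consumer of that data via the FS_IOC_FIEMAP ioctl looks roughly like the sketch below (the file path is only a placeholder):

/*
 * Minimal FS_IOC_FIEMAP consumer; the user-space view of what
 * ext4_ext_fiemap_cb() fills in via fiemap_fill_next_extent().
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	const unsigned int nr = 32;
	struct fiemap *fm;
	unsigned int i;
	int fd;

	fm = calloc(1, sizeof(*fm) + nr * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fd = open("/tmp/testfile", O_RDONLY);	/* placeholder path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = nr;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu len %llu%s%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " unwritten" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
	}

	close(fd);
	free(fm);
	return 0;
}
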
- * - * Currently the first step has been done. All delay extents are - * tracked in the tree. It maintains the delay extent when a delay - * allocation is issued, and the delay extent is written out or - * invalidated. Therefore the implementation of fiemap and bigalloc - * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. - * - * The following comment describes the implemenmtation of extent - * status tree and future works. - */ - -/* - * extents status tree implementation for ext4. - * - * - * ========================================================================== - * Extents status encompass delayed extents and extent locks - * - * 1. Why delayed extent implementation ? - * - * Without delayed extent, ext4 identifies a delayed extent by looking - * up page cache, this has several deficiencies - complicated, buggy, - * and inefficient code. - * - * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need - * to know if a block or a range of blocks are belonged to a delayed - * extent. - * - * Let us have a look at how they do without delayed extents implementation. - * -- FIEMAP - * FIEMAP looks up page cache to identify delayed allocations from holes. - * - * -- SEEK_HOLE/DATA - * SEEK_HOLE/DATA has the same problem as FIEMAP. - * - * -- bigalloc - * bigalloc looks up page cache to figure out if a block is - * already under delayed allocation or not to determine whether - * quota reserving is needed for the cluster. - * - * -- punch hole - * punch hole looks up page cache to identify a delayed extent. - * - * -- writeout - * Writeout looks up whole page cache to see if a buffer is - * mapped, If there are not very many delayed buffers, then it is - * time comsuming. - * - * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, - * bigalloc and writeout can figure out if a block or a range of - * blocks is under delayed allocation(belonged to a delayed extent) or - * not by searching the delayed extent tree. - * - * - * ========================================================================== - * 2. ext4 delayed extents impelmentation - * - * -- delayed extent - * A delayed extent is a range of blocks which are contiguous - * logically and under delayed allocation. Unlike extent in - * ext4, delayed extent in ext4 is a in-memory struct, there is - * no corresponding on-disk data. There is no limit on length of - * delayed extent, so a delayed extent can contain as many blocks - * as they are contiguous logically. - * - * -- delayed extent tree - * Every inode has a delayed extent tree and all under delayed - * allocation blocks are added to the tree as delayed extents. - * Delayed extents in the tree are ordered by logical block no. - * - * -- operations on a delayed extent tree - * There are three operations on a delayed extent tree: find next - * delayed extent, adding a space(a range of blocks) and removing - * a space. - * - * -- race on a delayed extent tree - * Delayed extent tree is protected inode->i_es_lock. - * - * - * ========================================================================== - * 3. performance analysis - * -- overhead - * 1. There is a cache extent for write access, so if writes are - * not very random, adding space operaions are in O(1) time. - * - * -- gain - * 2. Code is much simpler, more readable, more maintainable and - * more efficient. - * - * - * ========================================================================== - * 4. 
TODO list - * -- Track all extent status - * - * -- Improve get block process - * - * -- Extent-level locking - */ - -static struct kmem_cache *ext4_es_cachep; - -int __init ext4_init_es(void) -{ - ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); - if (ext4_es_cachep == NULL) - return -ENOMEM; - return 0; -} - -void ext4_exit_es(void) -{ - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); -} - -void ext4_es_init_tree(struct ext4_es_tree *tree) -{ - tree->root = RB_ROOT; - tree->cache_es = NULL; -} - -#ifdef ES_DEBUG__ -static void ext4_es_print_tree(struct inode *inode) -{ - struct ext4_es_tree *tree; - struct rb_node *node; - - printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); - tree = &EXT4_I(inode)->i_es_tree; - node = rb_first(&tree->root); - while (node) { - struct extent_status *es; - es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->start, es->len); - node = rb_next(node); - } - printk(KERN_DEBUG "\n"); -} -#else -#define ext4_es_print_tree(inode) -#endif - -static inline ext4_lblk_t extent_status_end(struct extent_status *es) -{ - BUG_ON(es->start + es->len < es->start); - return es->start + es->len - 1; -} - -/* - * search through the tree for an delayed extent with a given offset. If - * it can't be found, try to find next extent. - */ -static struct extent_status *__es_tree_search(struct rb_root *root, - ext4_lblk_t offset) -{ - struct rb_node *node = root->rb_node; - struct extent_status *es = NULL; - - while (node) { - es = rb_entry(node, struct extent_status, rb_node); - if (offset < es->start) - node = node->rb_left; - else if (offset > extent_status_end(es)) - node = node->rb_right; - else - return es; - } - - if (es && offset < es->start) - return es; - - if (es && offset > extent_status_end(es)) { - node = rb_next(&es->rb_node); - return node ? rb_entry(node, struct extent_status, rb_node) : - NULL; - } - - return NULL; -} - -/* - * ext4_es_find_extent: find the 1st delayed extent covering @es->start - * if it exists, otherwise, the next extent after @es->start. - * - * @inode: the inode which owns delayed extents - * @es: delayed extent that we found - * - * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no delay extent is found. - * Delayed extent is returned via @es. 
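
The lookup rule documented above for ext4_es_find_extent()/__es_tree_search() is "return the extent covering the offset if one exists, otherwise the next extent after it". A flat-array model of that rule, using simplified stand-in types rather than the rbtree the kernel code uses:

/*
 * Flat-array model of the __es_tree_search()/ext4_es_find_extent()
 * lookup rule; toy_es and toy_es_search are illustrative names only.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_es {
	unsigned int start;	/* first block the extent covers */
	unsigned int len;	/* length in blocks */
};

static unsigned int toy_es_end(const struct toy_es *es)
{
	return es->start + es->len - 1;
}

/* @tbl is sorted by start and the extents do not overlap. */
static const struct toy_es *toy_es_search(const struct toy_es *tbl, int nr,
					  unsigned int offset)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (offset < tbl[i].start)
			return &tbl[i];		/* next extent after offset */
		if (offset <= toy_es_end(&tbl[i]))
			return &tbl[i];		/* extent covering offset */
	}
	return NULL;				/* nothing at or after offset */
}

int main(void)
{
	struct toy_es tbl[] = { {10, 5}, {30, 2} };	/* [10/5), [30/2) */
	unsigned int probes[] = { 12, 20, 40 };
	int i;

	for (i = 0; i < 3; i++) {
		const struct toy_es *es = toy_es_search(tbl, 2, probes[i]);

		if (es)
			printf("offset %u -> [%u/%u)\n",
			       probes[i], es->start, es->len);
		else
			printf("offset %u -> no delayed extent\n", probes[i]);
	}
	return 0;
}
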
- */ -ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) -{ - struct ext4_es_tree *tree = NULL; - struct extent_status *es1 = NULL; - struct rb_node *node; - ext4_lblk_t ret = EXT_MAX_BLOCKS; - - trace_ext4_es_find_extent_enter(inode, es->start); - - read_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - - /* find delay extent in cache firstly */ - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(es->start, es1->start, es1->len)) { - es_debug("%u cached by [%u/%u)\n", - es->start, es1->start, es1->len); - goto out; - } - } - - es->len = 0; - es1 = __es_tree_search(&tree->root, es->start); - -out: - if (es1) { - tree->cache_es = es1; - es->start = es1->start; - es->len = es1->len; - node = rb_next(&es1->rb_node); - if (node) { - es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->start; - } - } - - read_unlock(&EXT4_I(inode)->i_es_lock); - - trace_ext4_es_find_extent_exit(inode, es, ret); - return ret; -} - -static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) -{ - struct extent_status *es; - es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); - if (es == NULL) - return NULL; - es->start = start; - es->len = len; - return es; -} - -static void ext4_es_free_extent(struct extent_status *es) -{ - kmem_cache_free(ext4_es_cachep, es); -} - -static struct extent_status * -ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) -{ - struct extent_status *es1; - struct rb_node *node; - - node = rb_prev(&es->rb_node); - if (!node) - return es; - - es1 = rb_entry(node, struct extent_status, rb_node); - if (es->start == extent_status_end(es1) + 1) { - es1->len += es->len; - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); - es = es1; - } - - return es; -} - -static struct extent_status * -ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) -{ - struct extent_status *es1; - struct rb_node *node; - - node = rb_next(&es->rb_node); - if (!node) - return es; - - es1 = rb_entry(node, struct extent_status, rb_node); - if (es1->start == extent_status_end(es) + 1) { - es->len += es1->len; - rb_erase(node, &tree->root); - ext4_es_free_extent(es1); - } - - return es; -} - -static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct rb_node **p = &tree->root.rb_node; - struct rb_node *parent = NULL; - struct extent_status *es; - ext4_lblk_t end = offset + len - 1; - - BUG_ON(end < offset); - es = tree->cache_es; - if (es && offset == (extent_status_end(es) + 1)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } else if (es && es->start == end + 1) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } else if (es && es->start <= offset && - end <= extent_status_end(es)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - goto out; - } - - while (*p) { - parent = *p; - es = rb_entry(parent, struct extent_status, rb_node); - - if (offset < es->start) { - if (es->start == end + 1) { - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } - p = &(*p)->rb_left; - } else if (offset > extent_status_end(es)) { - if (offset == extent_status_end(es) + 1) { - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } - p = &(*p)->rb_right; - } else { - 
if (extent_status_end(es) <= end) - es->len = offset - es->start + len; - goto out; - } - } - - es = ext4_es_alloc_extent(offset, len); - if (!es) - return -ENOMEM; - rb_link_node(&es->rb_node, parent, p); - rb_insert_color(&es->rb_node, &tree->root); - -out: - tree->cache_es = es; - return 0; -} - -/* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. - * - * ext4_es_insert_extent is called by ext4_da_write_begin and - * ext4_es_remove_extent. - * - * Return 0 on success, error code on failure. - */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct ext4_es_tree *tree; - int err = 0; - - trace_ext4_es_insert_extent(inode, offset, len); - es_debug("add [%u/%u) to extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - err = __es_insert_extent(tree, offset, len); - write_unlock(&EXT4_I(inode)->i_es_lock); - - ext4_es_print_tree(inode); - - return err; -} - -/* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. - * - * Return 0 on success, error code on failure. - */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) -{ - struct rb_node *node; - struct ext4_es_tree *tree; - struct extent_status *es; - struct extent_status orig_es; - ext4_lblk_t len1, len2, end; - int err = 0; - - trace_ext4_es_remove_extent(inode, offset, len); - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - end = offset + len - 1; - BUG_ON(end < offset); - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - es = __es_tree_search(&tree->root, offset); - if (!es) - goto out; - if (es->start > end) - goto out; - - /* Simply invalidate cache_es. */ - tree->cache_es = NULL; - - orig_es.start = es->start; - orig_es.len = es->len; - len1 = offset > es->start ? offset - es->start : 0; - len2 = extent_status_end(es) > end ? - extent_status_end(es) - end : 0; - if (len1 > 0) - es->len = len1; - if (len2 > 0) { - if (len1 > 0) { - err = __es_insert_extent(tree, end + 1, len2); - if (err) { - es->start = orig_es.start; - es->len = orig_es.len; - goto out; - } - } else { - es->start = end + 1; - es->len = len2; - } - goto out; - } - - if (len1 > 0) { - node = rb_next(&es->rb_node); - if (node) - es = rb_entry(node, struct extent_status, rb_node); - else - es = NULL; - } - - while (es && extent_status_end(es) <= end) { - node = rb_next(&es->rb_node); - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); - if (!node) { - es = NULL; - break; - } - es = rb_entry(node, struct extent_status, rb_node); - } - - if (es && es->start < end + 1) { - len1 = extent_status_end(es) - end; - es->start = end + 1; - es->len = len1; - } - -out: - write_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_print_tree(inode); - return err; -} diff --git a/trunk/fs/ext4/extents_status.h b/trunk/fs/ext4/extents_status.h deleted file mode 100644 index 077f82db092a..000000000000 --- a/trunk/fs/ext4/extents_status.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * fs/ext4/extents_status.h - * - * Written by Yongqiang Yang - * Modified by - * Allison Henderson - * Zheng Liu - * - */ - -#ifndef _EXT4_EXTENTS_STATUS_H -#define _EXT4_EXTENTS_STATUS_H - -/* - * Turn on ES_DEBUG__ to get lots of info about extent status operations. - */ -#ifdef ES_DEBUG__ -#define es_debug(fmt, ...) 
printk(fmt, ##__VA_ARGS__) -#else -#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - -struct extent_status { - struct rb_node rb_node; - ext4_lblk_t start; /* first block extent covers */ - ext4_lblk_t len; /* length of extent in block */ -}; - -struct ext4_es_tree { - struct rb_root root; - struct extent_status *cache_es; /* recently accessed extent */ -}; - -extern int __init ext4_init_es(void); -extern void ext4_exit_es(void); -extern void ext4_es_init_tree(struct ext4_es_tree *tree); - -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, - struct extent_status *es); - -#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index b64a60bf105a..bf3966bccd34 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -286,324 +285,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } -/* - * Here we use ext4_map_blocks() to get a block mapping for a extent-based - * file rather than ext4_ext_walk_space() because we can introduce - * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same - * function. When extent status tree has been fully implemented, it will - * track all extent status for a file and we can directly use it to - * retrieve the offset for SEEK_DATA/SEEK_HOLE. - */ - -/* - * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to - * lookup page cache to check whether or not there has some data between - * [startoff, endoff] because, if this range contains an unwritten extent, - * we determine this extent as a data or a hole according to whether the - * page cache has data or not. - */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int origin, - struct ext4_map_blocks *map, - loff_t *offset) -{ - struct pagevec pvec; - unsigned int blkbits; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff; - loff_t lastoff; - int found = 0; - - blkbits = inode->i_sb->s_blocksize_bits; - startoff = *offset; - lastoff = startoff; - endoff = (map->m_lblk + map->m_len) << blkbits; - - index = startoff >> PAGE_CACHE_SHIFT; - end = endoff >> PAGE_CACHE_SHIFT; - - pagevec_init(&pvec, 0); - do { - int i, num; - unsigned long nr_pages; - - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); - if (nr_pages == 0) { - if (origin == SEEK_DATA) - break; - - BUG_ON(origin != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; - break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && origin == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; - break; - } - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - /* - * If the current offset is not beyond the end of given - * range, it will be a hole. 
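
The ext4_seek_data()/ext4_seek_hole() code being deleted in this hunk backs the SEEK_DATA/SEEK_HOLE whence values of llseek. From user space the interface is plain lseek(2); a small sketch follows (the path is a placeholder and should name an existing, ideally sparse, file):

/*
 * User-space view of the SEEK_DATA/SEEK_HOLE support implemented by
 * ext4_seek_data()/ext4_seek_hole().  Sketch only; lseek() returns -1
 * with errno ENXIO when there is no further data or hole.
 */
#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	off_t data, hole;
	int fd = open("/tmp/sparsefile", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	data = lseek(fd, 0, SEEK_DATA);	/* first byte of data at/after 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after 0 */

	printf("first data at %lld, first hole at %lld\n",
	       (long long)data, (long long)hole);

	close(fd);
	return 0;
}
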
- */ - if (lastoff < endoff && origin == SEEK_HOLE && - page->index > end) { - found = 1; - *offset = lastoff; - goto out; - } - - lock_page(page); - - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - if (page_has_buffers(page)) { - lastoff = page_offset(page); - bh = head = page_buffers(page); - do { - if (buffer_uptodate(bh) || - buffer_unwritten(bh)) { - if (origin == SEEK_DATA) - found = 1; - } else { - if (origin == SEEK_HOLE) - found = 1; - } - if (found) { - *offset = max_t(loff_t, - startoff, lastoff); - unlock_page(page); - goto out; - } - lastoff += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && origin == SEEK_HOLE) { - found = 1; - *offset = lastoff; - break; - } - - index = pvec.pages[i - 1]->index + 1; - pagevec_release(&pvec); - } while (index <= end); - -out: - pagevec_release(&pvec); - return found; -} - -/* - * ext4_seek_data() retrieves the offset for SEEK_DATA. - */ -static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret = 0; - - mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = last << blkbits; - break; - } - - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - if (last != start) - dataoff = last << blkbits; - break; - } - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; - dataoff = last << blkbits; - } while (last <= end); - - mutex_unlock(&inode->i_mutex); - - if (dataoff > isize) - return -ENXIO; - - if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (dataoff > maxsize) - return -EINVAL; - - if (dataoff != file->f_pos) { - file->f_pos = dataoff; - file->f_version = 0; - } - - return dataoff; -} - -/* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
- */ -static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret = 0; - - mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = last << blkbits; - continue; - } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - last = es.start + es.len; - holeoff = last << blkbits; - continue; - } - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = last << blkbits; - continue; - } - } - - /* find a hole */ - break; - } while (last <= end); - - mutex_unlock(&inode->i_mutex); - - if (holeoff > isize) - holeoff = isize; - - if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (holeoff > maxsize) - return -EINVAL; - - if (holeoff != file->f_pos) { - file->f_pos = holeoff; - file->f_version = 0; - } - - return holeoff; -} - /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -619,19 +300,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - switch (origin) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); - case SEEK_DATA: - return ext4_seek_data(file, offset, maxbytes); - case SEEK_HOLE: - return ext4_seek_hole(file, offset, maxbytes); - } - - return -EINVAL; + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); } const struct file_operations ext4_file_operations = { @@ -656,10 +326,12 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/trunk/fs/ext4/fsync.c b/trunk/fs/ext4/fsync.c index dfbc1fe96674..be1d89f385b4 100644 --- a/trunk/fs/ext4/fsync.c +++ b/trunk/fs/ext4/fsync.c @@ -44,6 +44,7 @@ */ static int ext4_sync_parent(struct inode *inode) { + struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -65,7 +66,10 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - ret = sync_inode_metadata(inode, 1); + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = 
sync_inode(inode, &wbc); if (ret) break; } diff --git a/trunk/fs/ext4/ialloc.c b/trunk/fs/ext4/ialloc.c index 3f32c8012447..3a100e7a62a8 100644 --- a/trunk/fs/ext4/ialloc.c +++ b/trunk/fs/ext4/ialloc.c @@ -762,6 +762,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -774,7 +775,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); - brelse(block_bitmap_bh); if (err) goto fail; @@ -902,10 +902,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - ret = inode; dquot_initialize(inode); err = dquot_alloc_inode(inode); diff --git a/trunk/fs/ext4/indirect.c b/trunk/fs/ext4/indirect.c index 20862f96e8ae..792e388e7b44 100644 --- a/trunk/fs/ext4/indirect.c +++ b/trunk/fs/ext4/indirect.c @@ -22,7 +22,6 @@ #include "ext4_jbd2.h" #include "truncate.h" -#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include @@ -756,7 +755,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map, err); + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); return err; } @@ -1412,7 +1412,6 @@ void ext4_ind_truncate(struct inode *inode) down_write(&ei->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which diff --git a/trunk/fs/ext4/inline.c b/trunk/fs/ext4/inline.c deleted file mode 100644 index 387c47c6cda9..000000000000 --- a/trunk/fs/ext4/inline.c +++ /dev/null @@ -1,1884 +0,0 @@ -/* - * Copyright (c) 2012 Taobao. - * Written by Tao Ma - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" -#include "truncate.h" -#include - -#define EXT4_XATTR_SYSTEM_DATA "data" -#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 - -int ext4_get_inline_size(struct inode *inode) -{ - if (EXT4_I(inode)->i_inline_off) - return EXT4_I(inode)->i_inline_size; - - return 0; -} - -static int get_max_inline_xattr_value_size(struct inode *inode, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - int free, min_offs; - - min_offs = EXT4_SB(inode->i_sb)->s_inode_size - - EXT4_GOOD_OLD_INODE_SIZE - - EXT4_I(inode)->i_extra_isize - - sizeof(struct ext4_xattr_ibody_header); - - /* - * We need to subtract another sizeof(__u32) since an in-inode xattr - * needs an empty 4 bytes to indicate the gap between the xattr entry - * and the name/value pair. - */ - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) - return EXT4_XATTR_SIZE(min_offs - - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - - EXT4_XATTR_ROUND - sizeof(__u32)); - - raw_inode = ext4_raw_inode(iloc); - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - - /* Compute min_offs. */ - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { - size_t offs = le16_to_cpu(entry->e_value_offs); - if (offs < min_offs) - min_offs = offs; - } - } - free = min_offs - - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); - - if (EXT4_I(inode)->i_inline_off) { - entry = (struct ext4_xattr_entry *) - ((void *)raw_inode + EXT4_I(inode)->i_inline_off); - - free += le32_to_cpu(entry->e_value_size); - goto out; - } - - free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); - - if (free > EXT4_XATTR_ROUND) - free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); - else - free = 0; - -out: - return free; -} - -/* - * Get the maximum size we now can store in an inode. - * If we can't find the space for a xattr entry, don't use the space - * of the extents since we have no space to indicate the inline data. 
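
As the comment above notes, the usable inline-data capacity is EXT4_MIN_INLINE_DATA_SIZE, i.e. sizeof(__le32) * EXT4_N_BLOCKS = 60 bytes reused from i_block, plus whatever room the in-inode "system.data" xattr value can provide. A standalone model of how a given inline length splits across those two areas, mirroring the cp_len logic of ext4_read_inline_data() (not kernel code):

/*
 * Model of the i_block / system.data split used by ext4 inline data.
 * TOY_MIN_INLINE_DATA_SIZE stands in for EXT4_MIN_INLINE_DATA_SIZE.
 */
#include <stdio.h>

#define TOY_MIN_INLINE_DATA_SIZE (4 * 15)	/* sizeof(__le32) * EXT4_N_BLOCKS */

static void toy_split_inline(unsigned int len)
{
	unsigned int from_i_block = len < TOY_MIN_INLINE_DATA_SIZE ?
					len : TOY_MIN_INLINE_DATA_SIZE;
	unsigned int from_xattr = len - from_i_block;

	printf("%3u bytes inline: %2u from i_block, %3u from system.data\n",
	       len, from_i_block, from_xattr);
}

int main(void)
{
	toy_split_inline(40);	/* fits entirely in i_block */
	toy_split_inline(60);	/* exactly fills i_block */
	toy_split_inline(140);	/* 60 in i_block + 80 in the xattr value */
	return 0;
}
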
- */ -int ext4_get_max_inline_size(struct inode *inode) -{ - int error, max_inline_size; - struct ext4_iloc iloc; - - if (EXT4_I(inode)->i_extra_isize == 0) - return 0; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) { - ext4_error_inode(inode, __func__, __LINE__, 0, - "can't get inode location %lu", - inode->i_ino); - return 0; - } - - down_read(&EXT4_I(inode)->xattr_sem); - max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); - up_read(&EXT4_I(inode)->xattr_sem); - - brelse(iloc.bh); - - if (!max_inline_size) - return 0; - - return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; -} - -int ext4_has_inline_data(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && - EXT4_I(inode)->i_inline_off; -} - -/* - * this function does not take xattr_sem, which is OK because it is - * currently only used in a code path coming form ext4_iget, before - * the new inode has been unlocked - */ -int ext4_find_inline_data_nolock(struct inode *inode) -{ - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - if (!is.s.not_found) { - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + - le32_to_cpu(is.s.here->e_value_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - } -out: - brelse(is.iloc.bh); - return error; -} - -static int ext4_read_inline_data(struct inode *inode, void *buffer, - unsigned int len, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - int cp_len = 0; - struct ext4_inode *raw_inode; - - if (!len) - return 0; - - BUG_ON(len > EXT4_I(inode)->i_inline_size); - - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? - len : EXT4_MIN_INLINE_DATA_SIZE; - - raw_inode = ext4_raw_inode(iloc); - memcpy(buffer, (void *)(raw_inode->i_block), cp_len); - - len -= cp_len; - buffer += cp_len; - - if (!len) - goto out; - - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - len = min_t(unsigned int, len, - (unsigned int)le32_to_cpu(entry->e_value_size)); - - memcpy(buffer, - (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); - cp_len += len; - -out: - return cp_len; -} - -/* - * write the buffer to the inline inode. - * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_inline_set. - * That saves us one memcpy. - */ -void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, - void *buffer, loff_t pos, unsigned int len) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - int cp_len = 0; - - BUG_ON(!EXT4_I(inode)->i_inline_off); - BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); - - raw_inode = ext4_raw_inode(iloc); - buffer += pos; - - if (pos < EXT4_MIN_INLINE_DATA_SIZE) { - cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? 
- EXT4_MIN_INLINE_DATA_SIZE - pos : len; - memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); - - len -= cp_len; - buffer += cp_len; - pos += cp_len; - } - - if (!len) - return; - - pos -= EXT4_MIN_INLINE_DATA_SIZE; - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - - memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, - buffer, len); -} - -static int ext4_create_inline_data(handle_t *handle, - struct inode *inode, unsigned len) -{ - int error; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - if (len > EXT4_MIN_INLINE_DATA_SIZE) { - value = EXT4_ZERO_XATTR_VALUE; - len -= EXT4_MIN_INLINE_DATA_SIZE; - } else { - value = ""; - len = 0; - } - - /* Insert the the xttr entry. */ - i.value = value; - i.value_len = len; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - BUG_ON(!is.s.not_found); - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) { - if (error == -ENOSPC) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - goto out; - } - - memset((void *)ext4_raw_inode(&is.iloc)->i_block, - 0, EXT4_MIN_INLINE_DATA_SIZE); - - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; - ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - -out: - brelse(is.iloc.bh); - return error; -} - -static int ext4_update_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) -{ - int error; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - /* If the old space is ok, write the data directly. */ - if (len <= EXT4_I(inode)->i_inline_size) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - BUG_ON(is.s.not_found); - - len -= EXT4_MIN_INLINE_DATA_SIZE; - value = kzalloc(len, GFP_NOFS); - if (!value) - goto out; - - error = ext4_xattr_ibody_get(inode, i.name_index, i.name, - value, len); - if (error == -ENODATA) - goto out; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - /* Update the xttr entry. 
*/ - i.value = value; - i.value_len = len; - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) - goto out; - - EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - - (void *)ext4_raw_inode(&is.iloc)); - EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + - le32_to_cpu(is.s.here->e_value_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - -out: - kfree(value); - brelse(is.iloc.bh); - return error; -} - -int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) -{ - int ret, size; - struct ext4_inode_info *ei = EXT4_I(inode); - - if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return -ENOSPC; - - size = ext4_get_max_inline_size(inode); - if (size < len) - return -ENOSPC; - - down_write(&EXT4_I(inode)->xattr_sem); - - if (ei->i_inline_off) - ret = ext4_update_inline_data(handle, inode, len); - else - ret = ext4_create_inline_data(handle, inode, len); - - up_write(&EXT4_I(inode)->xattr_sem); - - return ret; -} - -static int ext4_destroy_inline_data_nolock(handle_t *handle, - struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_xattr_ibody_find is = { - .s = { .not_found = 0, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - .value = NULL, - .value_len = 0, - }; - int error; - - if (!ei->i_inline_off) - return 0; - - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - return error; - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto out; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto out; - - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); - if (error) - goto out; - - memset((void *)ext4_raw_inode(&is.iloc)->i_block, - 0, EXT4_MIN_INLINE_DATA_SIZE); - - if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - if (S_ISDIR(inode->i_mode) || - S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); - ext4_ext_tree_init(handle, inode); - } - } - ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); - - get_bh(is.iloc.bh); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - - EXT4_I(inode)->i_inline_off = 0; - EXT4_I(inode)->i_inline_size = 0; - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -out: - brelse(is.iloc.bh); - if (error == -ENODATA) - error = 0; - return error; -} - -static int ext4_read_inline_page(struct inode *inode, struct page *page) -{ - void *kaddr; - int ret = 0; - size_t len; - struct ext4_iloc iloc; - - BUG_ON(!PageLocked(page)); - BUG_ON(!ext4_has_inline_data(inode)); - BUG_ON(page->index); - - if (!EXT4_I(inode)->i_inline_off) { - ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", - inode->i_ino); - goto out; - } - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - goto out; - - len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - kaddr = kmap_atomic(page); - ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_page(page); - kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_CACHE_SIZE); - SetPageUptodate(page); - brelse(iloc.bh); - -out: - return ret; -} - -int ext4_readpage_inline(struct inode *inode, struct page *page) -{ - int ret = 0; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_read(&EXT4_I(inode)->xattr_sem); - return -EAGAIN; - } - - /* - 
* Current inline data can only exist in the 1st page, - * So for all the other pages, just set them uptodate. - */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); - } - - up_read(&EXT4_I(inode)->xattr_sem); - - unlock_page(page); - return ret >= 0 ? 0 : ret; -} - -static int ext4_convert_inline_data_to_extent(struct address_space *mapping, - struct inode *inode, - unsigned flags) -{ - int ret, needed_blocks; - handle_t *handle = NULL; - int retries = 0, sem_held = 0; - struct page *page = NULL; - unsigned from, to; - struct ext4_iloc iloc; - - if (!ext4_has_inline_data(inode)) { - /* - * clear the flag so that no new write - * will trap here again. - */ - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - return 0; - } - - needed_blocks = ext4_writepage_trans_blocks(inode); - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry: - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - down_write(&EXT4_I(inode)->xattr_sem); - sem_held = 1; - /* If some one has already done this for us, just exit. */ - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out; - } - - from = 0; - to = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out; - } - - ret = ext4_destroy_inline_data_nolock(handle, inode); - if (ret) - goto out; - - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, from, to, ext4_get_block_write); - else - ret = __block_write_begin(page, from, to, ext4_get_block); - - if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); - } - - if (ret) { - unlock_page(page); - page_cache_release(page); - ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); - sem_held = 0; - ext4_journal_stop(handle); - handle = NULL; - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might - * still be on the orphan list; we need to - * make sure the inode is removed from the - * orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - block_commit_write(page, from, to); -out: - if (page) { - unlock_page(page); - page_cache_release(page); - } - if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -} - -/* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. 
- */ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep) -{ - int ret; - handle_t *handle; - struct page *page; - struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out; - - /* We don't have space in inline inode, so convert it to extent. */ - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } - - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - *pagep = page; - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - unlock_page(page); - page_cache_release(page); - goto out_up_read; - } - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out_up_read; - } - - ret = 1; - handle = NULL; -out_up_read: - up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, - inode, flags); -} - -int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) -{ - int ret; - void *kaddr; - struct ext4_iloc iloc; - - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; - } - - down_write(&EXT4_I(inode)->xattr_sem); - BUG_ON(!ext4_has_inline_data(inode)); - - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); - - up_write(&EXT4_I(inode)->xattr_sem); - brelse(iloc.bh); -out: - return copied; -} - -struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - int ret; - void *kaddr; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return NULL; - } - - down_write(&EXT4_I(inode)->xattr_sem); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, 0, len); - kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); - - return iloc.bh; -} - -/* - * Try to make the page cache and handle ready for the inline data case. - * We can call this function in 2 cases: - * 1. The inode is created and the first write exceeds inline size. We can - * clear the inode state safely. - * 2. The inode has inline data, then we need to read the data, make it - * update and dirty so that ext4_da_writepages can handle it. We don't - * need to start the journal since the file's metatdata isn't changed now. 
- */ -static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, - struct inode *inode, - unsigned flags, - void **fsdata) -{ - int ret = 0, inline_size; - struct page *page; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) - return -ENOMEM; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - goto out; - } - - inline_size = ext4_get_inline_size(inode); - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out; - } - - ret = __block_write_begin(page, 0, inline_size, - ext4_da_get_block_prep); - if (ret) { - ext4_truncate_failed_write(inode); - goto out; - } - - SetPageDirty(page); - SetPageUptodate(page); - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - *fsdata = (void *)CONVERT_INLINE_DATA; - -out: - up_read(&EXT4_I(inode)->xattr_sem); - if (page) { - unlock_page(page); - page_cache_release(page); - } - return ret; -} - -/* - * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - int ret, inline_size; - handle_t *handle; - struct page *page; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - inline_size = ext4_get_max_inline_size(inode); - - ret = -ENOSPC; - if (inline_size >= pos + len) { - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out; - } - - if (ret == -ENOSPC) { - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - flags, - fsdata); - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, 0, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); - if (ret < 0) - goto out_release_page; - } - - up_read(&EXT4_I(inode)->xattr_sem); - *pagep = page; - handle = NULL; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - page_cache_release(page); -out: - if (handle) - ext4_journal_stop(handle); - brelse(iloc.bh); - return ret; -} - -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int i_size_changed = 0; - - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. 
- */ - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - i_size_changed = 1; - } - unlock_page(page); - page_cache_release(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - if (i_size_changed) - mark_inode_dirty(inode); - - return copied; -} - -#ifdef INLINE_DIR_DEBUG -void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, - void *inline_start, int inline_size) -{ - int offset; - unsigned short de_len; - struct ext4_dir_entry_2 *de = inline_start; - void *dlimit = inline_start + inline_size; - - trace_printk("inode %lu\n", dir->i_ino); - offset = 0; - while ((void *)de < dlimit) { - de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); - trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", - offset, de_len, de->name_len, de->name, - de->name_len, le32_to_cpu(de->inode)); - if (ext4_check_dir_entry(dir, NULL, de, bh, - inline_start, inline_size, offset)) - BUG(); - - offset += de_len; - de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); - } -} -#else -#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) -#endif - -/* - * Add a new entry into a inline dir. - * It will return -ENOSPC if no space is available, and -EIO - * and -EEXIST if directory entry already exists. - */ -static int ext4_add_dirent_to_inline(handle_t *handle, - struct dentry *dentry, - struct inode *inode, - struct ext4_iloc *iloc, - void *inline_start, int inline_size) -{ - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned short reclen; - int err; - struct ext4_dir_entry_2 *de; - - reclen = EXT4_DIR_REC_LEN(namelen); - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); - if (err) - return err; - - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) - return err; - ext4_insert_dentry(inode, de, inline_size, name, namelen); - - ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); - - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext4_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); - dir->i_version++; - ext4_mark_inode_dirty(handle, dir); - return 1; -} - -static void *ext4_get_inline_xattr_pos(struct inode *inode, - struct ext4_iloc *iloc) -{ - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - - BUG_ON(!EXT4_I(inode)->i_inline_off); - - header = IHDR(inode, ext4_raw_inode(iloc)); - entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + - EXT4_I(inode)->i_inline_off); - - return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); -} - -/* Set the final de to cover the whole block. 
*/ -static void ext4_update_final_de(void *de_buf, int old_size, int new_size) -{ - struct ext4_dir_entry_2 *de, *prev_de; - void *limit; - int de_len; - - de = (struct ext4_dir_entry_2 *)de_buf; - if (old_size) { - limit = de_buf + old_size; - do { - prev_de = de; - de_len = ext4_rec_len_from_disk(de->rec_len, old_size); - de_buf += de_len; - de = (struct ext4_dir_entry_2 *)de_buf; - } while (de_buf < limit); - - prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - - old_size, new_size); - } else { - /* this is just created, so create an empty entry. */ - de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(new_size, new_size); - } -} - -static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, - struct ext4_iloc *iloc) -{ - int ret; - int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; - int new_size = get_max_inline_xattr_value_size(dir, iloc); - - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) - return -ENOSPC; - - ret = ext4_update_inline_data(handle, dir, - new_size + EXT4_MIN_INLINE_DATA_SIZE); - if (ret) - return ret; - - ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, - EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE); - dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; - return 0; -} - -static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc, - void *buf, int inline_size) -{ - ext4_create_inline_data(handle, inode, inline_size); - ext4_write_inline_data(inode, iloc, buf, 0, inline_size); - ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -} - -static int ext4_finish_convert_inline_dir(handle_t *handle, - struct inode *inode, - struct buffer_head *dir_block, - void *buf, - int inline_size) -{ - int err, csum_size = 0, header_size = 0; - struct ext4_dir_entry_2 *de; - struct ext4_dir_entry_tail *t; - void *target = dir_block->b_data; - - /* - * First create "." and ".." and then copy the dir information - * back to the block. 
- */ - de = (struct ext4_dir_entry_2 *)target; - de = ext4_init_dot_dotdot(inode, de, - inode->i_sb->s_blocksize, csum_size, - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); - header_size = (void *)de - target; - - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, - inline_size - EXT4_INLINE_DOTDOT_SIZE); - - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - inode->i_size = inode->i_sb->s_blocksize; - i_size_write(inode, inode->i_sb->s_blocksize); - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - ext4_update_final_de(dir_block->b_data, - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, - inode->i_sb->s_blocksize - csum_size); - - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, - inode->i_sb->s_blocksize); - initialize_dirent_tail(t, inode->i_sb->s_blocksize); - } - set_buffer_uptodate(dir_block); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out; - set_buffer_verified(dir_block); -out: - return err; -} - -static int ext4_convert_inline_data_nolock(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc) -{ - int error; - void *buf = NULL; - struct buffer_head *data_bh = NULL; - struct ext4_map_blocks map; - int inline_size; - - inline_size = ext4_get_inline_size(inode); - buf = kmalloc(inline_size, GFP_NOFS); - if (!buf) { - error = -ENOMEM; - goto out; - } - - error = ext4_read_inline_data(inode, buf, inline_size, iloc); - if (error < 0) - goto out; - - error = ext4_destroy_inline_data_nolock(handle, inode); - if (error) - goto out; - - map.m_lblk = 0; - map.m_len = 1; - map.m_flags = 0; - error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); - if (error < 0) - goto out_restore; - if (!(map.m_flags & EXT4_MAP_MAPPED)) { - error = -EIO; - goto out_restore; - } - - data_bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!data_bh) { - error = -EIO; - goto out_restore; - } - - lock_buffer(data_bh); - error = ext4_journal_get_create_access(handle, data_bh); - if (error) { - unlock_buffer(data_bh); - error = -EIO; - goto out_restore; - } - memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); - - if (!S_ISDIR(inode->i_mode)) { - memcpy(data_bh->b_data, buf, inline_size); - set_buffer_uptodate(data_bh); - error = ext4_handle_dirty_metadata(handle, - inode, data_bh); - } else { - error = ext4_finish_convert_inline_dir(handle, inode, data_bh, - buf, inline_size); - } - - unlock_buffer(data_bh); -out_restore: - if (error) - ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); - -out: - brelse(data_bh); - kfree(buf); - return error; -} - -/* - * Try to add the new entry to the inline data. - * If succeeds, return 0. If not, extended the inline dir and copied data to - * the new created block. - */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - int ret, inline_size; - void *inline_start; - struct ext4_iloc iloc; - struct inode *dir = dentry->d_parent->d_inode; - - ret = ext4_get_inode_loc(dir, &iloc); - if (ret) - return ret; - - down_write(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) - goto out; - - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); - if (ret != -ENOSPC) - goto out; - - /* check whether it can be inserted to inline xattr space. 
*/ - inline_size = EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE; - if (!inline_size) { - /* Try to use the xattr space.*/ - ret = ext4_update_inline_dir(handle, dir, &iloc); - if (ret && ret != -ENOSPC) - goto out; - - inline_size = EXT4_I(dir)->i_inline_size - - EXT4_MIN_INLINE_DATA_SIZE; - } - - if (inline_size) { - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); - - if (ret != -ENOSPC) - goto out; - } - - /* - * The inline space is filled up, so create a new block for it. - * As the extent tree will be created, we have to save the inline - * dir first. - */ - ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); - -out: - ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - return ret; -} - -int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data) -{ - int error = 0; - unsigned int offset, parent_ino; - int i, stored; - struct ext4_dir_entry_2 *de; - struct super_block *sb; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret, inline_size = 0; - struct ext4_iloc iloc; - void *dir_buf = NULL; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_read(&EXT4_I(inode)->xattr_sem); - *has_inline_data = 0; - goto out; - } - - inline_size = ext4_get_inline_size(inode); - dir_buf = kmalloc(inline_size, GFP_NOFS); - if (!dir_buf) { - ret = -ENOMEM; - up_read(&EXT4_I(inode)->xattr_sem); - goto out; - } - - ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); - up_read(&EXT4_I(inode)->xattr_sem); - if (ret < 0) - goto out; - - sb = inode->i_sb; - stored = 0; - parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); - - while (!error && !stored && filp->f_pos < inode->i_size) { -revalidate: - /* - * If the version has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the inline - * dir to make sure. - */ - if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { - if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; - continue; - } - de = (struct ext4_dir_entry_2 *) - (dir_buf + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - inline_size); - } - offset = i; - filp->f_pos = offset; - filp->f_version = inode->i_version; - } - - while (!error && filp->f_pos < inode->i_size) { - if (filp->f_pos == 0) { - error = filldir(dirent, ".", 1, 0, inode->i_ino, - DT_DIR); - if (error) - break; - stored++; - - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); - if (error) - break; - stored++; - - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; - continue; - } - - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); - if (ext4_check_dir_entry(inode, filp, de, - iloc.bh, dir_buf, - inline_size, offset)) { - ret = stored; - goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); - if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. 
So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; - } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); - } - offset = 0; - } -out: - kfree(dir_buf); - brelse(iloc.bh); - return ret; -} - -struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval) -{ - struct ext4_iloc iloc; - - *retval = ext4_get_inode_loc(inode, &iloc); - if (*retval) - return NULL; - - *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - - return iloc.bh; -} - -/* - * Try to create the inline data for the new dir. - * If it succeeds, return 0, otherwise return the error. - * In case of ENOSPC, the caller should create the normal disk layout dir. - */ -int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, - struct inode *inode) -{ - int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; - struct ext4_iloc iloc; - struct ext4_dir_entry_2 *de; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - - ret = ext4_prepare_inline_data(handle, inode, inline_size); - if (ret) - goto out; - - /* - * For inline dir, we only save the inode information for the ".." - * and create a fake dentry to cover the left space. - */ - de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - de->inode = cpu_to_le32(parent->i_ino); - de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); - de->inode = 0; - de->rec_len = ext4_rec_len_to_disk( - inline_size - EXT4_INLINE_DOTDOT_SIZE, - inline_size); - set_nlink(inode, 2); - inode->i_size = EXT4_I(inode)->i_disksize = inline_size; -out: - brelse(iloc.bh); - return ret; -} - -struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data) -{ - int ret; - struct ext4_iloc iloc; - void *inline_start; - int inline_size; - - if (ext4_get_inode_loc(dir, &iloc)) - return NULL; - - down_read(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); - if (ret == 1) - goto out_find; - if (ret < 0) - goto out; - - if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) - goto out; - - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); - if (ret == 1) - goto out_find; - -out: - brelse(iloc.bh); - iloc.bh = NULL; -out_find: - up_read(&EXT4_I(dir)->xattr_sem); - return iloc.bh; -} - -int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data) -{ - int err, inline_size; - struct ext4_iloc iloc; - void *inline_start; - - err = ext4_get_inode_loc(dir, &iloc); - if (err) - return err; - - down_write(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - if ((void *)de_del - 
((void *)ext4_raw_inode(&iloc)->i_block) < - EXT4_MIN_INLINE_DATA_SIZE) { - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + - EXT4_INLINE_DOTDOT_SIZE; - inline_size = EXT4_MIN_INLINE_DATA_SIZE - - EXT4_INLINE_DOTDOT_SIZE; - } else { - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - inline_size = ext4_get_inline_size(dir) - - EXT4_MIN_INLINE_DATA_SIZE; - } - - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto out; - - err = ext4_generic_delete_entry(handle, dir, de_del, bh, - inline_start, inline_size, 0); - if (err) - goto out; - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_mark_inode_dirty(handle, dir); - if (unlikely(err)) - goto out; - - ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); -out: - up_write(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - if (err != -ENOENT) - ext4_std_error(dir->i_sb, err); - return err; -} - -/* - * Get the inline dentry at offset. - */ -static inline struct ext4_dir_entry_2 * -ext4_get_inline_entry(struct inode *inode, - struct ext4_iloc *iloc, - unsigned int offset, - void **inline_start, - int *inline_size) -{ - void *inline_pos; - - BUG_ON(offset > ext4_get_inline_size(inode)); - - if (offset < EXT4_MIN_INLINE_DATA_SIZE) { - inline_pos = (void *)ext4_raw_inode(iloc)->i_block; - *inline_size = EXT4_MIN_INLINE_DATA_SIZE; - } else { - inline_pos = ext4_get_inline_xattr_pos(inode, iloc); - offset -= EXT4_MIN_INLINE_DATA_SIZE; - *inline_size = ext4_get_inline_size(inode) - - EXT4_MIN_INLINE_DATA_SIZE; - } - - if (inline_start) - *inline_start = inline_pos; - return (struct ext4_dir_entry_2 *)(inline_pos + offset); -} - -int empty_inline_dir(struct inode *dir, int *has_inline_data) -{ - int err, inline_size; - struct ext4_iloc iloc; - void *inline_pos; - unsigned int offset; - struct ext4_dir_entry_2 *de; - int ret = 1; - - err = ext4_get_inode_loc(dir, &iloc); - if (err) { - EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", - err, dir->i_ino); - return 1; - } - - down_read(&EXT4_I(dir)->xattr_sem); - if (!ext4_has_inline_data(dir)) { - *has_inline_data = 0; - goto out; - } - - de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; - if (!le32_to_cpu(de->inode)) { - ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - no `..'", - dir->i_ino); - ret = 1; - goto out; - } - - offset = EXT4_INLINE_DOTDOT_SIZE; - while (offset < dir->i_size) { - de = ext4_get_inline_entry(dir, &iloc, offset, - &inline_pos, &inline_size); - if (ext4_check_dir_entry(dir, NULL, de, - iloc.bh, inline_pos, - inline_size, offset)) { - ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - " - "inode %u, rec_len %u, name_len %d" - "inline size %d\n", - dir->i_ino, le32_to_cpu(de->inode), - le16_to_cpu(de->rec_len), de->name_len, - inline_size); - ret = 1; - goto out; - } - if (le32_to_cpu(de->inode)) { - ret = 0; - goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, inline_size); - } - -out: - up_read(&EXT4_I(dir)->xattr_sem); - brelse(iloc.bh); - return ret; -} - -int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) -{ - int ret; - - down_write(&EXT4_I(inode)->xattr_sem); - ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); - - return ret; -} - -int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline) -{ - __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; - int error = 0; - struct ext4_iloc iloc; - - 
down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - goto out; - } - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - goto out; - - physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; - physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; - physical += offsetof(struct ext4_inode, i_block); - length = i_size_read(inode); - - if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); - brelse(iloc.bh); -out: - up_read(&EXT4_I(inode)->xattr_sem); - return (error < 0 ? error : 0); -} - -/* - * Called during xattr set, and if we can sparse space 'needed', - * just create the extent tree evict the data to the outer block. - * - * We use jbd2 instead of page cache to move data to the 1st block - * so that the whole transaction can be committed as a whole and - * the data isn't lost because of the delayed page cache write. - */ -int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed) -{ - int error; - struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - if (EXT4_XATTR_LEN(entry->e_name_len) + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { - error = -ENOSPC; - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); -out: - brelse(iloc.bh); - return error; -} - -void ext4_inline_data_truncate(struct inode *inode, int *has_inline) -{ - handle_t *handle; - int inline_size, value_len, needed_blocks; - size_t i_size; - void *value = NULL; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_info i = { - .name_index = EXT4_XATTR_INDEX_SYSTEM, - .name = EXT4_XATTR_SYSTEM_DATA, - }; - - - needed_blocks = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) - return; - - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - ext4_journal_stop(handle); - return; - } - - if (ext4_orphan_add(handle, inode)) - goto out; - - if (ext4_get_inode_loc(inode, &is.iloc)) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = inode->i_size; - inline_size = ext4_get_inline_size(inode); - EXT4_I(inode)->i_disksize = i_size; - - if (i_size < inline_size) { - /* Clear the content in the xattr space. */ - if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { - if (ext4_xattr_ibody_find(inode, &i, &is)) - goto out_error; - - BUG_ON(is.s.not_found); - - value_len = le32_to_cpu(is.s.here->e_value_size); - value = kmalloc(value_len, GFP_NOFS); - if (!value) - goto out_error; - - if (ext4_xattr_ibody_get(inode, i.name_index, i.name, - value, value_len)) - goto out_error; - - i.value = value; - i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? - i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; - if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) - goto out_error; - } - - /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); - - EXT4_I(inode)->i_inline_size = i_size < - EXT4_MIN_INLINE_DATA_SIZE ? 
- EXT4_MIN_INLINE_DATA_SIZE : i_size; - } - -out_error: - up_write(&EXT4_I(inode)->i_data_sem); -out: - brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); - kfree(value); - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - ext4_journal_stop(handle); - return; -} - -int ext4_convert_inline_data(struct inode *inode) -{ - int error, needed_blocks; - handle_t *handle; - struct ext4_iloc iloc; - - if (!ext4_has_inline_data(inode)) { - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - return 0; - } - - needed_blocks = ext4_writepage_trans_blocks(inode); - - iloc.bh = NULL; - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto out_free; - } - - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: - ext4_journal_stop(handle); -out_free: - brelse(iloc.bh); - return error; -} diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index cb1c1ab2720b..b3c243b9afa5 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -483,6 +483,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +/* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -531,16 +574,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* delayed alloc may be allocated by fallocate and - * coverted to initialized by directIO. - * we need to handle delayed extent here. 
- */ - down_write((&EXT4_I(inode)->i_data_sem)); - goto delayed_mapped; - } - ret = check_block_validity(inode, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -618,15 +652,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; -delayed_mapped: - /* delayed allocation blocks has been allocated */ - ret = ext4_es_remove_extent(inode, map->m_lblk, - map->m_len); - if (ret < 0) - retval = ret; - } + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); } up_write((&EXT4_I(inode)->i_data_sem)); @@ -649,13 +680,10 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; - if (ext4_has_inline_data(inode)) - return -ERANGE; - map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -770,13 +798,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -int ext4_walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -826,8 +854,8 @@ int ext4_walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
*/ -int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -850,7 +878,7 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -874,17 +902,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - flags, pagep); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } - } - retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { @@ -902,7 +919,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = -ENOMEM; goto out; } - *pagep = page; if (ext4_should_dioread_nolock(inode)) @@ -911,9 +927,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); } if (ret) { @@ -968,12 +983,7 @@ static int ext4_generic_write_end(struct file *file, struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size @@ -1124,21 +1134,16 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } - - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1296,7 +1301,6 @@ static void ext4_da_page_release_reservation(struct page *page, struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int num_clusters; - ext4_fsblk_t lblk; head = page_buffers(page); bh = head; @@ -1306,23 +1310,20 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - if (to_release) { - lblk = page->index << 
(PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, lblk, to_release); - } - /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster. */ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { + ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) + !ext4_find_delalloc_cluster(inode, lblk, 1)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1428,6 +1429,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1497,16 +1500,9 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t start, last; index = mpd->first_page; end = mpd->next_page - 1; - - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, start, last - start + 1); - - pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1660,6 +1656,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); + + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) { + /* Only if the journal is aborted */ + mpd->retval = err; + goto submit_io; + } + } } /* @@ -1790,19 +1795,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_has_inline_data(inode)) { - /* - * We will soon create blocks for this page, and let - * us pretend as if the blocks aren't allocated yet. - * In case of clusters, we have to handle the work - * of mapping from cluster so that the reserved space - * is calculated properly. - */ - if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - retval = 0; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); @@ -1821,10 +1814,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); - if (retval) - goto out_unlock; - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. */ @@ -1853,8 +1842,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. 
*/ -int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -1928,29 +1917,15 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs = NULL; + struct buffer_head *page_bufs; handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; + int ret = 0; + int err; ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } else { - page_bufs = page_buffers(page); - if (!page_bufs) { - BUG(); - goto out; - } - ext4_walk_page_buffers(handle, page_bufs, 0, len, - NULL, bget_one); - } + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1963,18 +1938,11 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_journal_get_write_access(handle, inode_bh); - - err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - } else { - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); - } + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1982,12 +1950,9 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, - NULL, bput_one); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: - brelse(inode_bh); return ret; } @@ -2064,8 +2029,8 @@ static int ext4_writepage(struct page *page, commit_write = 1; } page_bufs = page_buffers(page); - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do @@ -2131,8 +2096,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * mpage_da_map_and_submit to map a single contiguous memory region * and then write them. */ -static int write_cache_pages_da(handle_t *handle, - struct address_space *mapping, +static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) @@ -2211,17 +2175,6 @@ static int write_cache_pages_da(handle_t *handle, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - /* - * If we have inline data and arrive here, it means that - * we will soon create the block for the 1st page, so - * we'd better clear the inline data here. 
- */ - if (ext4_has_inline_data(inode)) { - BUG_ON(ext4_test_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA)); - ext4_destroy_inline_data(handle, inode); - } - if (mpd->next_page != page->index) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -2428,8 +2381,7 @@ static int ext4_da_writepages(struct address_space *mapping, * contiguous region of logical blocks that need * blocks to be allocated by ext4 and submit them. */ - ret = write_cache_pages_da(handle, mapping, - wbc, &mpd, &done_index); + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -2493,6 +2445,7 @@ static int ext4_da_writepages(struct address_space *mapping, return ret; } +#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { s64 free_blocks, dirty_blocks; @@ -2549,19 +2502,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); - - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, - pos, len, flags, - pagep, fsdata); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } - } - retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2663,13 +2603,22 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. */ + new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) { + if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + EXT4_I(inode)->i_disksize = new_i_size; + } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size @@ -2678,16 +2627,8 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - copied = ret2; if (ret2 < 0) ret = ret2; @@ -2780,12 +2721,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - /* - * We can get here for an inline file via the FIBMAP ioctl - */ - if (ext4_has_inline_data(inode)) - return 0; - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2831,30 +2766,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { - int ret = -EAGAIN; - struct inode *inode = page->mapping->host; - trace_ext4_readpage(page); - - if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); - - if (ret == -EAGAIN) - return mpage_readpage(page, ext4_get_block); - - return ret; + return mpage_readpage(page, ext4_get_block); 
} static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = mapping->host; - - /* If the file has inline data, no need to do readpages. */ - if (ext4_has_inline_data(inode)) - return 0; - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } @@ -2921,7 +2840,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -2931,12 +2850,29 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, } static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh_result, int flags) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0; + + ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", + inode->i_ino, flags); + + flags = EXT4_GET_BLOCKS_NO_LOCK; + + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + return ret; } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3042,10 +2978,10 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) * fall back to buffered IO. * * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are split, but + * If those blocks were preallocated, we mark sure they are splited, but * still keep the range to write as uninitialized. * - * The unwritten extents will be converted to written when DIO is completed. + * The unwrritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -3063,120 +2999,125 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; size_t count = iov_length(iov, nr_segs); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - /* Use the old path for reads and writes beyond i_size. 
*/ - if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + int overwrite = 0; - BUG_ON(iocb->private == NULL); + BUG_ON(iocb->private == NULL); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); - if (overwrite) { - atomic_inc(&inode->i_dio_count); - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } - - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents uninitialized. - * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. - */ - iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; + if (overwrite) { + atomic_inc(&inode->i_dio_count); + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; - } else { - get_block_func = ext4_get_block_write; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); - - if (iocb->private) - ext4_inode_aio_set(inode, NULL); - /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. - */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent parallel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. 
+ * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + ext4_io_end_t *io_end = + ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; + /* + * we save the io structure for current async + * direct IO, so that later ext4_map_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } + + if (overwrite) + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write_nolock, + ext4_end_io_dio, + NULL, + 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write, + ext4_end_io_dio, + NULL, + DIO_LOCKING); + if (iocb->private) + ext4_inode_aio_set(inode, NULL); /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } -retake_lock: - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - inode_dio_done(inode); - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); + retake_lock: + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + inode_dio_done(inode); + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); + } + + return ret; } - return ret; + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, @@ -3193,10 +3134,6 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; - /* Let buffer I/O handle the inline data case. 
*/ - if (ext4_has_inline_data(inode)) - return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -3594,14 +3531,6 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (ext4_has_inline_data(inode)) { - int has_inline = 1; - - ext4_inline_data_truncate(inode, &has_inline); - if (has_inline) - return; - } - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); else @@ -3827,19 +3756,6 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); - } else - EXT4_I(inode)->i_inline_off = 0; -} - struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3910,7 +3826,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ - ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -3983,7 +3898,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } @@ -4006,19 +3925,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (!ext4_has_inline_data(inode)) { - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode)))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); - } + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; @@ -4205,10 +4122,9 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else if (!ext4_has_inline_data(inode)) { + } else for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; - } 
raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -4895,9 +4811,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, page_buffers(page), - 0, len, NULL, - ext4_bh_unmapped)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; @@ -4918,7 +4833,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (ext4_walk_page_buffers(handle, page_buffers(page), 0, + if (walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index 1bf6fe785c4f..526e55358606 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_start += next; while (needed > ex->fe_len && - mb_find_buddy(e4b, order, &max)) { + (buddy = mb_find_buddy(e4b, order, &max))) { if (block + 1 >= max) break; @@ -2607,17 +2607,9 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4318,10 +4310,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) { - ext4_discard_allocated_blocks(ac); + if (*errp) goto errout; - } /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4343,10 +4333,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) { + } else if (*errp) + errout: ext4_discard_allocated_blocks(ac); - goto errout; - } else { + else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4357,7 +4347,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; } -errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; @@ -4667,16 +4656,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * with group lock held. 
generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%lu failed" - " with %d", block_group, bit, count, - err); - } - - + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4870,11 +4851,10 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, +static void ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; - int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4890,10 +4870,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); - return ret; } /** @@ -4922,7 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret = 0; + int ret; trace_ext4_trim_all_free(sb, group, start, max); @@ -4949,11 +4928,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); - if (ret && ret != -EOPNOTSUPP) - break; - ret = 0; + ext4_trim_extent(sb, start, + next - start, group, &e4b); count += next - start; } free_count += next - start; @@ -4974,10 +4950,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) { - ret = count; + if (!ret) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4985,7 +4959,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return ret; + return count; } /** diff --git a/trunk/fs/ext4/migrate.c b/trunk/fs/ext4/migrate.c index db8226d595fa..f1bb32ec0169 100644 --- a/trunk/fs/ext4/migrate.c +++ b/trunk/fs/ext4/migrate.c @@ -14,7 +14,6 @@ #include #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/trunk/fs/ext4/move_extent.c b/trunk/fs/ext4/move_extent.c index d9cc5ee42f53..292daeeed455 100644 --- a/trunk/fs/ext4/move_extent.c +++ b/trunk/fs/ext4/move_extent.c @@ -18,7 +18,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical block number. 
diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index cac448282331..6d600a69fc9d 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -202,8 +202,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ -void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize) +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) { memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( @@ -256,12 +261,6 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) -{ - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); -} - int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; @@ -272,7 +271,8 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) t = get_dirent_tail(inode, dirent); if (!t) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); return 0; } @@ -294,7 +294,8 @@ static void ext4_dirent_csum_set(struct inode *inode, t = get_dirent_tail(inode, dirent); if (!t) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); return; } @@ -302,9 +303,9 @@ static void ext4_dirent_csum_set(struct inode *inode, (void *)t - (void *)dirent); } -int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +static inline int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) { ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); @@ -376,7 +377,8 @@ static int ext4_dx_csum_verify(struct inode *inode, count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum found. Run e2fsck -D."); return 1; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -406,7 +408,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - warn_no_space_for_csum(inode); + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum. Run e2fsck -D."); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -887,7 +890,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, (block<i_sb)) + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. 
*/ @@ -1005,15 +1007,6 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, return (err); } -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) -{ - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); -} /* * Directory block splitting, compacting @@ -1088,6 +1081,13 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} + /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -1107,13 +1107,11 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -1121,8 +1119,8 @@ int search_dir(struct buffer_head *bh, const char *name = d_name->name; int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *)search_buf; - dlimit = search_buf + buf_size; + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ @@ -1130,8 +1128,7 @@ int search_dir(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1147,21 +1144,6 @@ int search_dir(struct buffer_head *bh, return 0; } -static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, - struct ext4_dir_entry *de) -{ - struct super_block *sb = dir->i_sb; - - if (!is_dx(dir)) - return 0; - if (block == 0) - return 1; - if (de->inode == 0 && - ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == - sb->s_blocksize) - return 1; - return 0; -} /* * ext4_find_entry() @@ -1176,8 +1158,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *inlined) + struct ext4_dir_entry_2 ** res_dir) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -1198,18 +1179,6 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; - - if (ext4_has_inline_data(dir)) { - int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, - &has_inline_data); - if (has_inline_data) { - if (inlined) - *inlined = 1; - return ret; - } - } - if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' 
|| name[1] == '\0')) { /* @@ -1275,8 +1244,6 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto next; } if (!buffer_verified(bh) && - !is_dx_internal_node(dir, block, - (struct ext4_dir_entry *)bh->b_data) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_INODE(dir, "checksumming directory " @@ -1394,7 +1361,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1428,7 +1395,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + bh = ext4_find_entry(child->d_inode, &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1626,63 +1593,6 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return NULL; } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, - void *buf, int buf_size, - const char *name, int namelen, - struct ext4_dir_entry_2 **dest_de) -{ - struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); - int nlen, rlen; - unsigned int offset = 0; - char *top; - - de = (struct ext4_dir_entry_2 *)buf; - top = buf + buf_size - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); - if ((de->inode ? rlen - nlen : rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; - - *dest_de = de; - return 0; -} - -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen) -{ - - int nlen, rlen; - - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = - (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); - de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); -} /* * Add a new entry into a directory (leaf) block. 
If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1698,10 +1608,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; + unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; + int nlen, rlen, err; + char *top; int csum_size = 0; - int err; if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -1709,11 +1621,22 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); - if (err) - return err; + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + (blocksize - csum_size) - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1723,8 +1646,19 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, name, namelen); - + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1897,17 +1831,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; - - if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); - if (retval < 0) - return retval; - if (retval == 1) { - retval = 0; - return retval; - } - } - if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -2113,29 +2036,36 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, } /* - * ext4_generic_delete_entry deletes a directory entry by merging it - * with the previous entry + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry */ -int ext4_generic_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - void *entry_buf, - int buf_size, - int csum_size) +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int csum_size = 0; + int i, err; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + 
csum_size = sizeof(struct ext4_dir_entry_tail); i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *)entry_buf; - while (i < buf_size - csum_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, i)) + de = (struct ext4_dir_entry_2 *) bh->b_data; + while (i < bh->b_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -2146,6 +2076,12 @@ int ext4_generic_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2155,48 +2091,6 @@ int ext4_generic_delete_entry(handle_t *handle, return -ENOENT; } -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) -{ - int err, csum_size = 0; - - if (ext4_has_inline_data(dir)) { - int has_inline_data = 1; - err = ext4_delete_inline_entry(handle, dir, de_del, bh, - &has_inline_data); - if (has_inline_data) - return err; - } - - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out; - - err = ext4_generic_delete_entry(handle, dir, de_del, - bh, bh->b_data, - dir->i_sb->s_blocksize, csum_size); - if (err) - goto out; - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); - if (unlikely(err)) - goto out; - - return 0; -out: - if (err != -ENOENT) - ext4_std_error(dir->i_sb, err); - return err; -} - /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. 
@@ -2317,95 +2211,21 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, return err; } -struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len) -{ - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); - - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(parent_ino); - de->name_len = 2; - if (!dotdot_real_len) - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), - blocksize); - else - de->rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(de->name_len), blocksize); - strcpy(de->name, ".."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); - - return ext4_next_entry(de, blocksize); -} - -static int ext4_init_new_dir(handle_t *handle, struct inode *dir, - struct inode *inode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { + handle_t *handle; + struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err; + int err, retries = 0; if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - err = ext4_try_create_inline_dir(handle, dir, inode); - if (err < 0 && err != -ENOSPC) - goto out; - if (!err) - goto out; - } - - inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out; - } - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out; - de = (struct ext4_dir_entry_2 *)dir_block->b_data; - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); - set_nlink(inode, 2); - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); - initialize_dirent_tail(t, blocksize); - } - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out; - set_buffer_verified(dir_block); -out: - brelse(dir_block); - return err; -} - -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2429,9 +2249,47 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - err = ext4_init_new_dir(handle, dir, inode); + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + goto out_clear_inode; + } + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + 
de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + set_nlink(inode, 2); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) goto out_clear_inode; + set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2451,6 +2309,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2468,14 +2327,6 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - - err = empty_inline_dir(inode, &has_inline_data); - if (has_inline_data) - return err; - } - sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { @@ -2542,8 +2393,7 @@ static int empty_dir(struct inode *inode) set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, - bh->b_data, bh->b_size, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2729,7 +2579,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2794,7 +2644,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_handle_sync(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2976,39 +2826,8 @@ static int ext4_link(struct dentry *old_dentry, return err; } - -/* - * Try to find buffer head where contains the parent block. - * It should be the inode block if it is inlined or the 1st block - * if it is a normal dir. 
- */ -static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, - struct inode *inode, - int *retval, - struct ext4_dir_entry_2 **parent_de, - int *inlined) -{ - struct buffer_head *bh; - - if (!ext4_has_inline_data(inode)) { - if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { - if (!*retval) { - *retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - return NULL; - } - *parent_de = ext4_next_entry( - (struct ext4_dir_entry_2 *)bh->b_data, - inode->i_sb->s_blocksize); - return bh; - } - - *inlined = 1; - return ext4_get_first_inline_block(inode, parent_de, retval); -} +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -3022,8 +2841,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; - int inlined = 0, new_inlined = 0; - struct ext4_dir_entry_2 *parent_de; dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -3043,7 +2860,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) ext4_handle_sync(handle); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3056,8 +2873,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, - &new_de, &new_inlined); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { brelse(new_bh); @@ -3071,17 +2887,22 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext4_get_first_dir_block(handle, old_inode, - &retval, &parent_de, - &inlined); - if (!dir_bh) + if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(old_inode->i_sb, + "Directory hole detected on inode %lu\n", + old_inode->i_ino); + } goto end_rename; - if (!inlined && !buffer_verified(dir_bh) && + } + if (!buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) goto end_rename; set_buffer_verified(dir_bh); - if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -3110,13 +2931,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - if (!new_inlined) { - retval = ext4_handle_dirty_dirent_node(handle, - new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } + retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; } brelse(new_bh); new_bh = NULL; @@ -3144,8 +2962,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; 
struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, - &old_de2, NULL); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -3165,19 +2982,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - parent_de->inode = cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (!inlined) { - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); - } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, dir_bh); - } + if (is_dx(old_inode)) { + retval = ext4_handle_dirty_dx_node(handle, + old_inode, + dir_bh); } else { - retval = ext4_mark_inode_dirty(handle, old_inode); + retval = ext4_handle_dirty_dirent_node(handle, + old_inode, + dir_bh); } if (retval) { ext4_std_error(old_dir->i_sb, retval); @@ -3228,19 +3043,23 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif .get_acl = ext4_get_acl, }; diff --git a/trunk/fs/ext4/page-io.c b/trunk/fs/ext4/page-io.c index 0016fbca2a40..68e896e12a67 100644 --- a/trunk/fs/ext4/page-io.c +++ b/trunk/fs/ext4/page-io.c @@ -27,6 +27,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" static struct kmem_cache *io_page_cachep, *io_end_cachep; @@ -110,7 +111,7 @@ static int ext4_end_io(ext4_io_end_t *io) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } diff --git a/trunk/fs/ext4/resize.c b/trunk/fs/ext4/resize.c index d99387b89edd..47bf06a2765d 100644 --- a/trunk/fs/ext4/resize.c +++ b/trunk/fs/ext4/resize.c @@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto exit_sbh; err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) @@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto exit_dindj; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +846,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); + /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); +exit_dindj: + /* ext4_handle_release_buffer(handle, dind); */ +exit_sbh: + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -964,8 +969,14 @@ static int 
reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_handle_release_buffer(handle, primary[j]); + */ goto exit_bh; + } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 3cdb0a2fc648..80928f716850 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -45,7 +45,7 @@ #include #include "ext4.h" -#include "ext4_extents.h" /* Needed for trace points definition */ +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -939,11 +939,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - ext4_es_init_tree(&ei->i_es_tree); - rwlock_init(&ei->i_es_lock); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -997,7 +996,9 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); +#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } @@ -1030,7 +1031,6 @@ void ext4_clear_inode(struct inode *inode) clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1447,8 +1447,13 @@ static const struct mount_opts { {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, +#ifdef CONFIG_EXT4_FS_XATTR {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#else + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, +#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, @@ -3197,6 +3202,7 @@ int ext4_calculate_overhead(struct super_block *sb) ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_KERNEL); + memset(buf, 0, PAGE_SIZE); if (!buf) return -ENOMEM; @@ -3250,7 +3256,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err = 0; + int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3266,6 +3272,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb = sb; + sbi->s_mount_opt = 0; + sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3276,7 +3285,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; - /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ 
-3361,7 +3369,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ +#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); +#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3652,6 +3662,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -3759,6 +3770,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); + ret = err; goto failed_mount3; } @@ -3789,6 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3884,8 +3897,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (es->s_overhead_clusters) sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); else { - err = ext4_calculate_overhead(sb); - if (err) + ret = ext4_calculate_overhead(sb); + if (ret) goto failed_mount_wq; } @@ -3897,7 +3910,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); - ret = -ENOMEM; goto failed_mount_wq; } @@ -4000,20 +4012,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Enable quota usage during mount. */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && !(sb->s_flags & MS_RDONLY)) { - err = ext4_enable_quotas(sb); - if (err) + ret = ext4_enable_quotas(sb); + if (ret) goto failed_mount7; } #endif /* CONFIG_QUOTA */ - if (test_opt(sb, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - ext4_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); @@ -4080,7 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(sbi); out_free_orig: kfree(orig_data); - return err ? 
err : ret; + return ret; } /* @@ -4786,7 +4790,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ @@ -5278,7 +5282,6 @@ static int __init ext4_init_fs(void) ext4_li_info = NULL; mutex_init(&ext4_li_mtx); - /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { @@ -5286,14 +5289,9 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_es(); - if (err) - return err; - err = ext4_init_pageio(); if (err) - goto out7; - + return err; err = ext4_init_system_zone(); if (err) goto out6; @@ -5343,9 +5341,6 @@ static int __init ext4_init_fs(void) ext4_exit_system_zone(); out6: ext4_exit_pageio(); -out7: - ext4_exit_es(); - return err; } diff --git a/trunk/fs/ext4/symlink.c b/trunk/fs/ext4/symlink.c index ff3711932018..ed9354aff279 100644 --- a/trunk/fs/ext4/symlink.c +++ b/trunk/fs/ext4/symlink.c @@ -35,18 +35,22 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, +#endif }; diff --git a/trunk/fs/ext4/xattr.c b/trunk/fs/ext4/xattr.c index 3a91ebc2b66f..2cdb98d62980 100644 --- a/trunk/fs/ext4/xattr.c +++ b/trunk/fs/ext4/xattr.c @@ -61,6 +61,11 @@ #include "xattr.h" #include "acl.h" +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -307,7 +312,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, return error; } -int +static int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -576,6 +581,21 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -628,14 +648,9 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. 
*/ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); return 0; } @@ -674,14 +689,9 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); } } return 0; @@ -784,6 +794,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); + ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -939,8 +950,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, #undef header } -int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -968,47 +985,10 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_search *s = &is->s; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext4_xattr_set_entry(i, s); - if (error) { - if (error == -ENOSPC && - ext4_has_inline_data(inode)) { - error = ext4_try_to_evict_inline_data(handle, inode, - EXT4_XATTR_LEN(strlen(i->name) + - EXT4_XATTR_SIZE(i->value_len))); - if (error) - return error; - error = ext4_xattr_ibody_find(inode, i, is); - if (error) - return error; - error = ext4_xattr_set_entry(i, s); - } - if (error) - return error; - } - header = IHDR(inode, ext4_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); - } - return 0; -} - -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1164,17 +1144,9 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); retry: - /* - * In case of inline data, we may push out the data to a block, - * So reserve the journal space first. 
- */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/trunk/fs/ext4/xattr.h b/trunk/fs/ext4/xattr.h index 69eda787a96a..91f31ca7d9af 100644 --- a/trunk/fs/ext4/xattr.h +++ b/trunk/fs/ext4/xattr.h @@ -21,7 +21,6 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 -#define EXT4_XATTR_INDEX_SYSTEM 7 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -66,32 +65,7 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - -#define EXT4_ZERO_XATTR_VALUE ((void *)-1) - -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; +# ifdef CONFIG_EXT4_FS_XATTR extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; @@ -116,82 +90,60 @@ extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; -extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); -extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, - const char *name, - void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); - -extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); -extern int ext4_get_max_inline_size(struct inode *inode); -extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); - -extern int ext4_readpage_inline(struct inode *inode, struct page *page); -extern int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); -extern int ext4_try_add_inline_entry(handle_t 
*handle, struct dentry *dentry, - struct inode *inode); -extern int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode); -extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data); -extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data); -extern int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); -extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); - -extern int ext4_convert_inline_data(struct inode *inode); +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static __init inline int +ext4_init_xattr(void) +{ + return 0; +} + +static inline void +ext4_exit_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, diff --git a/trunk/fs/jbd2/journal.c b/trunk/fs/jbd2/journal.c index dbf41f9452db..484b8d1c6cb6 100644 --- a/trunk/fs/jbd2/journal.c +++ b/trunk/fs/jbd2/journal.c @@ -60,6 +60,7 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); #if 0 EXPORT_SYMBOL(journal_sync_buffer); diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 42f6615af0ac..d8da40e99d84 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -1207,6 +1207,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) return ret; } +/* + * jbd2_journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + /** * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 
* @handle: transaction handle diff --git a/trunk/fs/nfs/idmap.c b/trunk/fs/nfs/idmap.c index bc3968fa81e5..9cc4a3fbf4b0 100644 --- a/trunk/fs/nfs/idmap.c +++ b/trunk/fs/nfs/idmap.c @@ -193,15 +193,19 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".id_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; diff --git a/trunk/include/asm-generic/pgtable.h b/trunk/include/asm-generic/pgtable.h index 701beab27aab..284e80831d2c 100644 --- a/trunk/include/asm-generic/pgtable.h +++ b/trunk/include/asm-generic/pgtable.h @@ -219,10 +219,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif -#ifndef pte_accessible -# define pte_accessible(pte) ((void)(pte),1) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -584,112 +580,6 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } -#ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -/* - * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the - * same bit too). It's set only when _PAGE_PRESET is not set and it's - * never set if _PAGE_PRESENT is set. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; -} -#endif - -/* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. 
- */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pte = pte_clear_flags(pte, _PAGE_NUMA); - return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmd = pmd_clear_flags(pmd, _PAGE_NUMA); - return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pte = pte_set_flags(pte, _PAGE_NUMA); - return pte_clear_flags(pte, _PAGE_PRESENT); -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmd = pmd_set_flags(pmd, _PAGE_NUMA); - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} -#endif -#else -extern int pte_numa(pte_t pte); -extern int pmd_numa(pmd_t pmd); -extern pte_t pte_mknonnuma(pte_t pte); -extern pmd_t pmd_mknonnuma(pmd_t pmd); -extern pte_t pte_mknuma(pte_t pte); -extern pmd_t pmd_mknuma(pmd_t pmd); -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ -#else -static inline int pmd_numa(pmd_t pmd) -{ - return 0; -} - -static inline int pte_numa(pte_t pte) -{ - return 0; -} - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/trunk/include/linux/cred.h b/trunk/include/linux/cred.h index 0142aacb70b7..ebbed2ce6637 100644 --- a/trunk/include/linux/cred.h +++ b/trunk/include/linux/cred.h @@ -76,6 +76,21 @@ extern int groups_search(const struct group_info *, kgid_t); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); +/* + * The common credentials for a thread group + * - shared by CLONE_THREAD + */ +#ifdef CONFIG_KEYS +struct thread_group_cred { + atomic_t usage; + pid_t tgid; /* thread group process ID */ + spinlock_t lock; + struct key __rcu *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ + struct rcu_head rcu; /* RCU deletion hook */ +}; +#endif + /* * The security context of a task * @@ -124,8 +139,6 @@ struct cred { #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ - struct key __rcu *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ struct thread_group_cred *tgcred; /* thread-group shared credentials */ diff --git a/trunk/include/linux/huge_mm.h b/trunk/include/linux/huge_mm.h index 1d76f8ca90f0..092dc5305a32 100644 --- a/trunk/include/linux/huge_mm.h +++ b/trunk/include/linux/huge_mm.h @@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, @@ -112,7 +111,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock_write(__anon_vma); \ + anon_vma_lock(__anon_vma); \ anon_vma_unlock(__anon_vma); \ 
BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ @@ -172,10 +171,6 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } - -extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp); - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -214,13 +209,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } - -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) -{ - return 0; -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 0c80d3f57a5b..3e7fa1acf09c 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); #else /* !CONFIG_HUGETLB_PAGE */ @@ -132,11 +132,7 @@ static inline void copy_huge_page(struct page *dst, struct page *src) { } -static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot) -{ - return 0; -} +#define hugetlb_change_protection(vma, address, end, newprot) static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 1be23d9fdacb..3efc43f3f162 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -1096,6 +1096,7 @@ extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); +extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern void jbd2_journal_invalidatepage(journal_t *, @@ -1302,21 +1303,15 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); -/* JBD uses a CRC32 checksum */ -#define JBD_MAX_CHECKSUM_SIZE 4 - static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[JBD_MAX_CHECKSUM_SIZE]; + char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; } desc; int err; - BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > - JBD_MAX_CHECKSUM_SIZE); - desc.shash.tfm = journal->j_chksum_driver; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; diff --git a/trunk/include/linux/key.h b/trunk/include/linux/key.h index 4dfde1161c5e..2393b1c040b6 100644 --- a/trunk/include/linux/key.h +++ b/trunk/include/linux/key.h @@ -265,7 +265,6 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, unsigned long flags, struct key *dest); diff --git a/trunk/include/linux/mempolicy.h 
b/trunk/include/linux/mempolicy.h index 9adc270de7ef..dbd212723b74 100644 --- a/trunk/include/linux/mempolicy.h +++ b/trunk/include/linux/mempolicy.h @@ -188,8 +188,6 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } -extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); - #else struct mempolicy {}; @@ -309,11 +307,5 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } -static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, - unsigned long address) -{ - return -1; /* no node preference */ -} - #endif /* CONFIG_NUMA */ #endif diff --git a/trunk/include/linux/migrate.h b/trunk/include/linux/migrate.h index 1e9f627967a3..0b5865c61efd 100644 --- a/trunk/include/linux/migrate.h +++ b/trunk/include/linux/migrate.h @@ -23,15 +23,6 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page * sucessful migration case. */ -enum migrate_reason { - MR_COMPACTION, - MR_MEMORY_FAILURE, - MR_MEMORY_HOTPLUG, - MR_SYSCALL, /* also applies to cpusets */ - MR_MEMPOLICY_MBIND, - MR_NUMA_MISPLACED, - MR_CMA -}; #ifdef CONFIG_MIGRATION @@ -41,7 +32,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode, int reason); + enum migrate_mode mode); extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -63,7 +54,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } @@ -92,37 +83,4 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ - -#ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, int node); -extern int migrate_misplaced_page(struct page *page, int node); -extern bool migrate_ratelimited(int node); -#else -static inline int migrate_misplaced_page(struct page *page, int node) -{ - return -EAGAIN; /* can't migrate now */ -} -static inline bool migrate_ratelimited(int node) -{ - return false; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node); -#else -static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node) -{ - return -EAGAIN; -} -#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ - #endif /* _LINUX_MIGRATE_H */ diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index 7f4f906190bd..4af4f0b1be4c 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -693,36 +693,6 @@ static inline int 
page_to_nid(const struct page *page) } #endif -#ifdef CONFIG_NUMA_BALANCING -static inline int page_xchg_last_nid(struct page *page, int nid) -{ - return xchg(&page->_last_nid, nid); -} - -static inline int page_last_nid(struct page *page) -{ - return page->_last_nid; -} -static inline void reset_page_last_nid(struct page *page) -{ - page->_last_nid = -1; -} -#else -static inline int page_xchg_last_nid(struct page *page, int nid) -{ - return page_to_nid(page); -} - -static inline int page_last_nid(struct page *page) -{ - return page_to_nid(page); -} - -static inline void reset_page_last_nid(struct page *page) -{ -} -#endif - static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; @@ -1108,9 +1078,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1612,11 +1579,6 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -#endif - struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); @@ -1638,7 +1600,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/trunk/include/linux/mm_types.h b/trunk/include/linux/mm_types.h index 7d9ebb7cc982..7ade2731b5d6 100644 --- a/trunk/include/linux/mm_types.h +++ b/trunk/include/linux/mm_types.h @@ -175,10 +175,6 @@ struct page { */ void *shadow; #endif - -#ifdef CONFIG_NUMA_BALANCING - int _last_nid; -#endif } /* * The struct page can be forced to be double word aligned so that atomic ops @@ -414,37 +410,10 @@ struct mm_struct { #endif #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; -#endif -#ifdef CONFIG_NUMA_BALANCING - /* - * numa_next_scan is the next time when the PTEs will me marked - * pte_numa to gather statistics and migrate pages to new nodes - * if necessary - */ - unsigned long numa_next_scan; - - /* numa_next_reset is when the PTE scanner period will be reset */ - unsigned long numa_next_reset; - - /* Restart point for scanning and setting pte_numa */ - unsigned long numa_scan_offset; - - /* numa_scan_seq prevents two threads setting pte_numa */ - int numa_scan_seq; - - /* - * The first node a task was scheduled on. If a task runs on - * a different node than Make PTE Scan Go Now. 
- */ - int first_nid; #endif struct uprobes_state uprobes_state; }; -/* first nid will either be a valid NID or one of these values */ -#define NUMA_PTE_SCAN_INIT -1 -#define NUMA_PTE_SCAN_ACTIVE -2 - static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h index 4bec5be82cab..cd55dad56aac 100644 --- a/trunk/include/linux/mmzone.h +++ b/trunk/include/linux/mmzone.h @@ -735,19 +735,6 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; -#ifdef CONFIG_NUMA_BALANCING - /* - * Lock serializing the per destination node AutoNUMA memory - * migration rate limiting data. - */ - spinlock_t numabalancing_migrate_lock; - - /* Rate limiting time interval */ - unsigned long numabalancing_migrate_next_window; - - /* Number of pages migrated during the rate limiting time interval */ - unsigned long numabalancing_migrate_nr_pages; -#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/trunk/include/linux/rmap.h b/trunk/include/linux/rmap.h index c20635c527a9..bfe1f4780644 100644 --- a/trunk/include/linux/rmap.h +++ b/trunk/include/linux/rmap.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include /* @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - struct anon_vma *root; /* Root of this anon_vma tree */ - struct rw_semaphore rwsem; /* W: modification, R: walking the list */ + struct anon_vma *root; /* Root of this anon_vma tree */ + struct mutex mutex; /* Serialize access to vma list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct rb_node rb; /* locked by anon_vma->rwsem */ + struct rb_node rb; /* locked by anon_vma->mutex */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; @@ -108,37 +108,26 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - down_write(&anon_vma->root->rwsem); + mutex_lock(&anon_vma->root->mutex); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - up_write(&anon_vma->root->rwsem); + mutex_unlock(&anon_vma->root->mutex); } -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +static inline void anon_vma_lock(struct anon_vma *anon_vma) { - down_write(&anon_vma->root->rwsem); + mutex_lock(&anon_vma->root->mutex); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - up_write(&anon_vma->root->rwsem); + mutex_unlock(&anon_vma->root->mutex); } -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - - /* * anon_vma helper functions. */ @@ -231,8 +220,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. 
*/ -struct anon_vma *page_lock_anon_vma_read(struct page *page); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index b089c92c609b..2c2f3072beef 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -1527,14 +1527,6 @@ struct task_struct { short il_next; short pref_node_fork; #endif -#ifdef CONFIG_NUMA_BALANCING - int numa_scan_seq; - int numa_migrate_seq; - unsigned int numa_scan_period; - u64 node_stamp; /* migration stamp */ - struct callback_head numa_work; -#endif /* CONFIG_NUMA_BALANCING */ - struct rcu_head rcu; /* @@ -1609,18 +1601,6 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) -#ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); -extern void set_numabalancing_state(bool enabled); -#else -static inline void task_numa_fault(int node, int pages, bool migrated) -{ -} -static inline void set_numabalancing_state(bool enabled) -{ -} -#endif - /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -2050,13 +2030,6 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; -extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; - #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/trunk/include/linux/swiotlb.h b/trunk/include/linux/swiotlb.h index 071d62c214a6..8d08b3ed406d 100644 --- a/trunk/include/linux/swiotlb.h +++ b/trunk/include/linux/swiotlb.h @@ -34,25 +34,21 @@ enum dma_sync_target { SYNC_FOR_CPU = 0, SYNC_FOR_DEVICE = 1, }; +extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir); -/* define the last possible byte of physical address space as a mapping error */ -#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0) - -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, size_t size, - enum dma_data_direction dir); - -extern void swiotlb_tbl_unmap_single(struct device *hwdev, - phys_addr_t tlb_addr, +extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, enum dma_data_direction dir); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, +extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, enum dma_data_direction dir, enum dma_sync_target target); /* Accessory functions. 
*/ +extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/trunk/include/linux/vm_event_item.h b/trunk/include/linux/vm_event_item.h index fce0a2799d43..fe786f07d2bd 100644 --- a/trunk/include/linux/vm_event_item.h +++ b/trunk/include/linux/vm_event_item.h @@ -38,18 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, -#ifdef CONFIG_NUMA_BALANCING - NUMA_PTE_UPDATES, - NUMA_HINT_FAULTS, - NUMA_HINT_FAULTS_LOCAL, - NUMA_PAGE_MIGRATE, -#endif -#ifdef CONFIG_MIGRATION - PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, -#endif #ifdef CONFIG_COMPACTION - COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, - COMPACTISOLATED, + COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/trunk/include/linux/vmstat.h b/trunk/include/linux/vmstat.h index a13291f7da88..92a86b2cce33 100644 --- a/trunk/include/linux/vmstat.h +++ b/trunk/include/linux/vmstat.h @@ -80,14 +80,6 @@ static inline void vm_events_fold_cpu(int cpu) #endif /* CONFIG_VM_EVENT_COUNTERS */ -#ifdef CONFIG_NUMA_BALANCING -#define count_vm_numa_event(x) count_vm_event(x) -#define count_vm_numa_events(x, y) count_vm_events(x, y) -#else -#define count_vm_numa_event(x) do {} while (0) -#define count_vm_numa_events(x, y) do {} while (0) -#endif /* CONFIG_NUMA_BALANCING */ - #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/trunk/include/trace/events/ext4.h b/trunk/include/trace/events/ext4.h index f6372b011366..d49b285385e8 100644 --- a/trunk/include/trace/events/ext4.h +++ b/trunk/include/trace/events/ext4.h @@ -15,7 +15,6 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct ext4_extent; -struct extent_status; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -1520,9 +1519,10 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned int len, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, lblk, pblk, len, ret), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1530,37 +1530,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) - __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pblk = map->m_pblk; - __entry->lblk = map->m_lblk; - __entry->len = map->m_len; - __entry->flags = map->m_flags; + __entry->pblk = pblk; + __entry->lblk = lblk; + __entry->len = len; __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk, - __entry->len, __entry->flags, __entry->ret) + __entry->len, __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode 
*inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), - TP_ARGS(inode, map, ret) + TP_ARGS(inode, lblk, pblk, len, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), - TP_ARGS(inode, map, ret) + TP_ARGS(inode, lblk, pblk, len, ret) ); TRACE_EVENT(ext4_ext_load_extent, @@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, ); TRACE_EVENT(ext4_ext_handle_uninitialized_extents, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, unsigned int allocated, ext4_fsblk_t newblock), - TP_ARGS(inode, map, flags, allocated, newblock), + TP_ARGS(inode, map, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->flags = flags; + __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; @@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, @@ -2055,106 +2055,6 @@ TRACE_EVENT(ext4_ext_remove_space_done, (unsigned short) __entry->eh_entries) ); -TRACE_EVENT(ext4_es_insert_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), - - TP_ARGS(inode, start, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, start ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len) -); - -TRACE_EVENT(ext4_es_remove_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), - - TP_ARGS(inode, start, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, start ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len) -); - -TRACE_EVENT(ext4_es_find_extent_enter, - TP_PROTO(struct inode *inode, ext4_lblk_t start), - - TP_ARGS(inode, start), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, start ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = start; - ), - - TP_printk("dev %d,%d ino %lu start %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start) -); - -TRACE_EVENT(ext4_es_find_extent_exit, - TP_PROTO(struct inode *inode, struct extent_status *es, - ext4_lblk_t ret), - - TP_ARGS(inode, es, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, start ) - __field( 
ext4_lblk_t, len ) - __field( ext4_lblk_t, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->start = es->start; - __entry->len = es->len; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->start, __entry->len, __entry->ret) -); - #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/trunk/include/trace/events/migrate.h b/trunk/include/trace/events/migrate.h deleted file mode 100644 index ec2a6ccfd7e5..000000000000 --- a/trunk/include/trace/events/migrate.h +++ /dev/null @@ -1,51 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM migrate - -#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_MIGRATE_H - -#define MIGRATE_MODE \ - {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ - {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ - {MIGRATE_SYNC, "MIGRATE_SYNC"} - -#define MIGRATE_REASON \ - {MR_COMPACTION, "compaction"}, \ - {MR_MEMORY_FAILURE, "memory_failure"}, \ - {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ - {MR_SYSCALL, "syscall_or_cpuset"}, \ - {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ - {MR_CMA, "cma"} - -TRACE_EVENT(mm_migrate_pages, - - TP_PROTO(unsigned long succeeded, unsigned long failed, - enum migrate_mode mode, int reason), - - TP_ARGS(succeeded, failed, mode, reason), - - TP_STRUCT__entry( - __field( unsigned long, succeeded) - __field( unsigned long, failed) - __field( enum migrate_mode, mode) - __field( int, reason) - ), - - TP_fast_assign( - __entry->succeeded = succeeded; - __entry->failed = failed; - __entry->mode = mode; - __entry->reason = reason; - ), - - TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", - __entry->succeeded, - __entry->failed, - __print_symbolic(__entry->mode, MIGRATE_MODE), - __print_symbolic(__entry->reason, MIGRATE_REASON)) -); - -#endif /* _TRACE_MIGRATE_H */ - -/* This part must be outside protection */ -#include diff --git a/trunk/include/uapi/linux/mempolicy.h b/trunk/include/uapi/linux/mempolicy.h index 0d11c3dcd3a1..23e62e0537e2 100644 --- a/trunk/include/uapi/linux/mempolicy.h +++ b/trunk/include/uapi/linux/mempolicy.h @@ -20,7 +20,6 @@ enum { MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, - MPOL_LOCAL, MPOL_MAX, /* always last member of enum */ }; @@ -48,15 +47,9 @@ enum mpol_rebind_step { /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform - to policy */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ -#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ -#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ - -#define MPOL_MF_VALID (MPOL_MF_STRICT | \ - MPOL_MF_MOVE | \ - MPOL_MF_MOVE_ALL) +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ /* * Internal flags that share the struct mempolicy flags word with @@ -66,8 +59,6 @@ enum mpol_rebind_step { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ -#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ -#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa 
Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 1a207efca591..2054e048bb98 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -717,50 +717,6 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -# -# For architectures that want to enable the support for NUMA-affine scheduler -# balancing logic: -# -config ARCH_SUPPORTS_NUMA_BALANCING - bool - -# For architectures that (ab)use NUMA to represent different memory regions -# all cpu-local but of different latencies, such as SuperH. -# -config ARCH_WANT_NUMA_VARIABLE_LOCALITY - bool - -# -# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE -config ARCH_WANTS_PROT_NUMA_PROT_NONE - bool - -config ARCH_USES_NUMA_PROT_NONE - bool - default y - depends on ARCH_WANTS_PROT_NUMA_PROT_NONE - depends on NUMA_BALANCING - -config NUMA_BALANCING_DEFAULT_ENABLED - bool "Automatically enable NUMA aware memory/task placement" - default y - depends on NUMA_BALANCING - help - If set, autonumic NUMA balancing will be enabled if running on a NUMA - machine. - -config NUMA_BALANCING - bool "Memory placement aware NUMA scheduler" - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when - it is references to the node the task is running on. - - This system will be inactive on UMA systems. - menuconfig CGROUPS boolean "Control Group support" depends on EVENTFD diff --git a/trunk/kernel/cred.c b/trunk/kernel/cred.c index 8888afb846e9..48cea3da6d05 100644 --- a/trunk/kernel/cred.c +++ b/trunk/kernel/cred.c @@ -29,6 +29,17 @@ static struct kmem_cache *cred_jar; +/* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), +}; +#endif + /* * The initial credentials for the initial task */ @@ -54,6 +65,9 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -81,6 +95,36 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) #endif } +/* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. 
+ */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + /* * The RCU callback to actually dispose of a set of credentials */ @@ -106,10 +150,9 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); - key_put(cred->session_keyring); - key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); + release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -203,6 +246,15 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -256,10 +308,9 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS - key_get(new->session_keyring); - key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -283,20 +334,39 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { + struct thread_group_cred *tgcred = NULL; struct cred *new; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + new = prepare_creds(); - if (!new) + if (!new) { + kfree(tgcred); return new; + } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + /* inherit the session keyring; new process keyring */ - key_put(new->process_keyring); - new->process_keyring = NULL; + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; #endif return new; @@ -313,6 +383,9 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif struct cred *new; int ret; @@ -352,12 +425,22 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* The process keyring is only shared between the threads in a process; - * anything outside of those threads doesn't inherit. 
- */ + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ if (!(clone_flags & CLONE_THREAD)) { - key_put(new->process_keyring); - new->process_keyring = NULL; + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + ret = -ENOMEM; + goto error_put; + } + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; } #endif @@ -560,6 +643,9 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif const struct cred *old; struct cred *new; @@ -567,6 +653,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } +#endif + kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -584,10 +678,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - new->session_keyring = NULL; - new->process_keyring = NULL; - new->thread_keyring = NULL; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = NULL; + new->tgcred = tgcred; new->request_key_auth = NULL; + new->thread_keyring = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 115d6c2e4cca..3c31e874afad 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -822,9 +822,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; -#endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index c1fb82104bfb..0533496b6228 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -193,10 +193,23 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static int sched_feat_set(char *cmp) +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - int i; + char buf[64]; + char *cmp; int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -216,27 +229,6 @@ static int sched_feat_set(char *cmp) } } - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1568,40 +1560,7 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; - p->mm->numa_scan_seq = 0; - } - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? 
p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; -#endif /* CONFIG_NUMA_BALANCING */ -} - -#ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG -void set_numabalancing_state(bool enabled) -{ - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool numabalancing_enabled; - -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; } -#endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ /* * fork()/clone()-time setup: diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 9af5af979a13..756f9f9e8542 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -26,9 +26,6 @@ #include #include #include -#include -#include -#include #include @@ -777,227 +774,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#ifdef CONFIG_NUMA_BALANCING -/* - * numa task sample period in ms - */ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; - -/* Portion of address space to scan in MB */ -unsigned int sysctl_numa_balancing_scan_size = 256; - -/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ -unsigned int sysctl_numa_balancing_scan_delay = 1000; - -static void task_numa_placement(struct task_struct *p) -{ - int seq = ACCESS_ONCE(p->mm->numa_scan_seq); - - if (p->numa_scan_seq == seq) - return; - p->numa_scan_seq = seq; - - /* FIXME: Scheduling placement policy hints go here */ -} - -/* - * Got a PROT_NONE fault for a page on @node. - */ -void task_numa_fault(int node, int pages, bool migrated) -{ - struct task_struct *p = current; - - if (!sched_feat_numa(NUMA)) - return; - - /* FIXME: Allocate task-specific structure for placement policy here */ - - /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes - */ - if (!migrated) - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(10)); - - task_numa_placement(p); -} - -static void reset_ptenuma_scan(struct task_struct *p) -{ - ACCESS_ONCE(p->mm->numa_scan_seq)++; - p->mm->numa_scan_offset = 0; -} - -/* - * The expensive part of numa migration is done from task_work context. - * Triggered from task_tick_numa(). - */ -void task_numa_work(struct callback_head *work) -{ - unsigned long migrate, next_scan, now = jiffies; - struct task_struct *p = current; - struct mm_struct *mm = p->mm; - struct vm_area_struct *vma; - unsigned long start, end; - long pages; - - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); - - work->next = work; /* protect against double add */ - /* - * Who cares about NUMA placement when they're dying. - * - * NOTE: make sure not to dereference p->mm before this check, - * exit_task_work() happens _after_ exit_mm() so we could be called - * without p->mm even though we still had it when we enqueued this - * work. - */ - if (p->flags & PF_EXITING) - return; - - /* - * We do not care about task placement until a task runs on a node - * other than the first one used by the address space. This is - * largely because migrations are driven by what CPU the task - * is running on. 
If it's never scheduled on another node, it'll - * not migrate so why bother trapping the fault. - */ - if (mm->first_nid == NUMA_PTE_SCAN_INIT) - mm->first_nid = numa_node_id(); - if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { - /* Are we running on a new node yet? */ - if (numa_node_id() == mm->first_nid && - !sched_feat_numa(NUMA_FORCE)) - return; - - mm->first_nid = NUMA_PTE_SCAN_ACTIVE; - } - - /* - * Reset the scan period if enough time has gone by. Objective is that - * scanning will be reduced if pages are properly placed. As tasks - * can enter different phases this needs to be re-examined. Lacking - * proper tracking of reference behaviour, this blunt hammer is used. - */ - migrate = mm->numa_next_reset; - if (time_after(now, migrate)) { - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - xchg(&mm->numa_next_reset, next_scan); - } - - /* - * Enforce maximal scan/migration frequency.. - */ - migrate = mm->numa_next_scan; - if (time_before(now, migrate)) - return; - - if (p->numa_scan_period == 0) - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - - next_scan = now + msecs_to_jiffies(p->numa_scan_period); - if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) - return; - - /* - * Do not set pte_numa if the current running node is rate-limited. - * This loses statistics on the fault but if we are unwilling to - * migrate to this node, it is less likely we can do useful work - */ - if (migrate_ratelimited(numa_node_id())) - return; - - start = mm->numa_scan_offset; - pages = sysctl_numa_balancing_scan_size; - pages <<= 20 - PAGE_SHIFT; /* MB in pages */ - if (!pages) - return; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, start); - if (!vma) { - reset_ptenuma_scan(p); - start = 0; - vma = mm->mmap; - } - for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) - continue; - - /* Skip small VMAs. They are not likely to be of relevance */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) - continue; - - do { - start = max(start, vma->vm_start); - end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); - end = min(end, vma->vm_end); - pages -= change_prot_numa(vma, start, end); - - start = end; - if (pages <= 0) - goto out; - } while (end != vma->vm_end); - } - -out: - /* - * It is possible to reach the end of the VMA list but the last few VMAs are - * not guaranteed to the vma_migratable. If they are not, we would find the - * !migratable VMA on the next scan but not reset the scanner to the start - * so check it now. - */ - if (vma) - mm->numa_scan_offset = start; - else - reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); -} - -/* - * Drive the periodic memory faults.. - */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ - struct callback_head *work = &curr->numa_work; - u64 period, now; - - /* - * We don't care about NUMA placement if we don't have memory. - */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) - return; - - /* - * Using runtime rather than walltime has the dual advantage that - * we (mostly) drive the selection from busy threads and that the - * task needs to have done some actual work before we bother with - * NUMA placement. 
- */ - now = curr->se.sum_exec_runtime; - period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - - if (now - curr->node_stamp > period) { - if (!curr->node_stamp) - curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; - curr->node_stamp = now; - - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ - task_work_add(curr, work, true); - } - } -} -#else -static void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5725,9 +5501,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_feat_numa(NUMA)) - task_tick_numa(rq, curr); - update_rq_runnable_avg(rq, 1); } diff --git a/trunk/kernel/sched/features.h b/trunk/kernel/sched/features.h index 1ad1d2b5395f..e68e69ab917d 100644 --- a/trunk/kernel/sched/features.h +++ b/trunk/kernel/sched/features.h @@ -66,14 +66,3 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) - -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. - */ -#ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) -SCHED_FEAT(NUMA_FORCE, false) -#endif diff --git a/trunk/kernel/sched/sched.h b/trunk/kernel/sched/sched.h index fc886441436a..5eca173b563f 100644 --- a/trunk/kernel/sched/sched.h +++ b/trunk/kernel/sched/sched.h @@ -663,18 +663,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) -#else -extern bool numabalancing_enabled; -#endif /* CONFIG_SCHED_DEBUG */ -#else -#define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) -#endif /* CONFIG_NUMA_BALANCING */ - static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c index 5af44b593770..ee376beedaf9 100644 --- a/trunk/kernel/seccomp.c +++ b/trunk/kernel/seccomp.c @@ -396,29 +396,25 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; - struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, + syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, regs); + syscall_rollback(current, task_pt_regs(current)); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. 
*/ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) goto skip; - } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -429,9 +425,6 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. */ - return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/trunk/kernel/sysctl.c b/trunk/kernel/sysctl.c index c88878db491e..33f71f37267e 100644 --- a/trunk/kernel/sysctl.c +++ b/trunk/kernel/sysctl.c @@ -256,11 +256,9 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif /* CONFIG_SMP */ -#endif /* CONFIG_SCHED_DEBUG */ +#endif #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -303,7 +301,6 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, -#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -350,45 +347,7 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_reset", - .data = &sysctl_numa_balancing_scan_period_reset, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ +#endif { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/trunk/lib/swiotlb.c b/trunk/lib/swiotlb.c index 196b06984dec..f114bf6a8e13 100644 --- a/trunk/lib/swiotlb.c +++ b/trunk/lib/swiotlb.c @@ -57,7 +57,7 @@ int swiotlb_force; * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. 
*/ -static phys_addr_t io_tlb_start, io_tlb_end; +static char *io_tlb_start, *io_tlb_end; /* * The number of IO TLB blocks (in groups of 64) between io_tlb_start and @@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -static phys_addr_t io_tlb_overflow_buffer; +static void *io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -125,37 +125,26 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; + phys_addr_t pstart, pend; - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); + pstart = virt_to_phys(io_tlb_start); + pend = virt_to_phys(io_tlb_end); printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); + (unsigned long long)pstart, (unsigned long long)pend - 1, + bytes >> 20, io_tlb_start, io_tlb_end - 1); } void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); - if (!v_overflow_buffer) - panic("Cannot allocate SWIOTLB overflow buffer!\n"); - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE @@ -167,6 +156,12 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_index = 0; io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); + if (!io_tlb_overflow_buffer) + panic("Cannot allocate SWIOTLB overflow buffer!\n"); if (verbose) swiotlb_print_info(); } @@ -178,7 +173,6 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) static void __init swiotlb_init_with_default_size(size_t default_size, int verbose) { - unsigned char *vstart; unsigned long bytes; if (!io_tlb_nslabs) { @@ -191,11 +185,11 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) /* * Get IO TLB memory from the low pages */ - vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); - if (!vstart) + io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); + if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer"); - swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose); + swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); } void __init @@ -213,7 +207,6 @@ int swiotlb_late_init_with_default_size(size_t default_size) { unsigned long bytes, req_nslabs = io_tlb_nslabs; - unsigned char *vstart = NULL; unsigned int order; int rc = 0; @@ -230,14 +223,14 @@ swiotlb_late_init_with_default_size(size_t default_size) bytes = io_tlb_nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, - order); - if (vstart) + io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (io_tlb_start) break; order--; } - if (!vstart) { + if 
(!io_tlb_start) { io_tlb_nslabs = req_nslabs; return -ENOMEM; } @@ -246,9 +239,9 @@ swiotlb_late_init_with_default_size(size_t default_size) "for software IO TLB\n", (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs); if (rc) - free_pages((unsigned long)vstart, order); + free_pages((unsigned long)io_tlb_start, order); return rc; } @@ -256,25 +249,14 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; - unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; - memset(tlb, 0, bytes); - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); + memset(io_tlb_start, 0, bytes); /* * Allocate and initialize the free list array. This array is used @@ -284,7 +266,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) - goto cleanup3; + goto cleanup2; for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); @@ -295,10 +277,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) get_order(io_tlb_nslabs * sizeof(phys_addr_t))); if (!io_tlb_orig_addr) - goto cleanup4; + goto cleanup3; memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!io_tlb_overflow_buffer) + goto cleanup4; + swiotlb_print_info(); late_alloc = 1; @@ -306,42 +296,42 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; cleanup4: + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + io_tlb_orig_addr = NULL; +cleanup3: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); io_tlb_list = NULL; -cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; cleanup2: - io_tlb_end = 0; - io_tlb_start = 0; + io_tlb_end = NULL; + io_tlb_start = NULL; io_tlb_nslabs = 0; return -ENOMEM; } void __init swiotlb_free(void) { - if (!io_tlb_orig_addr) + if (!io_tlb_overflow_buffer) return; if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), + free_pages((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), + free_pages((unsigned long)io_tlb_start, get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - free_bootmem_late(io_tlb_overflow_buffer, + free_bootmem_late(__pa(io_tlb_overflow_buffer), PAGE_ALIGN(io_tlb_overflow)); free_bootmem_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); free_bootmem_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - free_bootmem_late(io_tlb_start, + free_bootmem_late(__pa(io_tlb_start), PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 
0; @@ -349,21 +339,21 @@ void __init swiotlb_free(void) static int is_swiotlb_buffer(phys_addr_t paddr) { - return paddr >= io_tlb_start && paddr < io_tlb_end; + return paddr >= virt_to_phys(io_tlb_start) && + paddr < virt_to_phys(io_tlb_end); } /* * Bounce: copy the swiotlb buffer back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { - unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned long pfn = PFN_DOWN(phys); if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ - unsigned int offset = orig_addr & ~PAGE_MASK; + unsigned int offset = phys & ~PAGE_MASK; char *buffer; unsigned int sz = 0; unsigned long flags; @@ -374,31 +364,32 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn)); if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); + memcpy(dma_addr, buffer + offset, sz); else - memcpy(buffer + offset, vaddr, sz); + memcpy(buffer + offset, dma_addr, sz); kunmap_atomic(buffer); local_irq_restore(flags); size -= sz; pfn++; - vaddr += sz; + dma_addr += sz; offset = 0; } - } else if (dir == DMA_TO_DEVICE) { - memcpy(vaddr, phys_to_virt(orig_addr), size); } else { - memcpy(phys_to_virt(orig_addr), vaddr, size); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, phys_to_virt(phys), size); + else + memcpy(phys_to_virt(phys), dma_addr, size); } } +EXPORT_SYMBOL_GPL(swiotlb_bounce); -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir) +void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir) { unsigned long flags; - phys_addr_t tlb_addr; + char *dma_addr; unsigned int nslots, stride, index, wrap; int i; unsigned long mask; @@ -462,7 +453,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, io_tlb_list[i] = 0; for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); + dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); /* * Update the indices to avoid searching in the next @@ -480,7 +471,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); - return SWIOTLB_MAP_ERROR; + return NULL; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -490,11 +481,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * needed. */ for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); - return tlb_addr; + return dma_addr; } EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); @@ -502,10 +493,11 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); * Allocates bounce buffer and returns its kernel virtual address. 
*/ -phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir) +static void * +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) { - dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); } @@ -513,19 +505,20 @@ phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +void +swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; /* * First, sync the memory before unmapping the entry */ - if (orig_addr && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -554,27 +547,26 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, } EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void +swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir, + enum dma_sync_target target) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); break; @@ -597,15 +589,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_mask = hwdev->coherent_dma_mask; ret = (void *)__get_free_pages(flags, order); - if (ret) { - dev_addr = swiotlb_virt_to_bus(hwdev, ret); - if (dev_addr + size - 1 > dma_mask) { - /* - * The allocated memory isn't reachable by the device. - */ - free_pages((unsigned long) ret, order); - ret = NULL; - } + if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) { + /* + * The allocated memory isn't reachable by the device. 
+ */ + free_pages((unsigned long) ret, order); + ret = NULL; } if (!ret) { /* @@ -613,29 +602,25 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ - phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE); - if (paddr == SWIOTLB_MAP_ERROR) + ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); + if (!ret) return NULL; + } - ret = phys_to_virt(paddr); - dev_addr = phys_to_dma(hwdev, paddr); + memset(ret, 0, size); + dev_addr = swiotlb_virt_to_bus(hwdev, ret); - /* Confirm address can be DMA'd by device */ - if (dev_addr + size - 1 > dma_mask) { - printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dma_mask, - (unsigned long long)dev_addr); + /* Confirm address can be DMA'd by device */ + if (dev_addr + size - 1 > dma_mask) { + printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dma_mask, + (unsigned long long)dev_addr); - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, - size, DMA_TO_DEVICE); - return NULL; - } + /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + return NULL; } - *dma_handle = dev_addr; - memset(ret, 0, size); - return ret; } EXPORT_SYMBOL(swiotlb_alloc_coherent); @@ -651,7 +636,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE); + swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -692,8 +677,9 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); + void *map; BUG_ON(dir == DMA_NONE); /* @@ -704,19 +690,23 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; - /* Oh well, have to allocate and map a bounce buffer. */ + /* + * Oh well, have to allocate and map a bounce buffer. 
+ */ map = map_single(dev, phys, size, dir); - if (map == SWIOTLB_MAP_ERROR) { + if (!map) { swiotlb_full(dev, size, dir, 1); - return phys_to_dma(dev, io_tlb_overflow_buffer); + map = io_tlb_overflow_buffer; } - dev_addr = phys_to_dma(dev, map); + dev_addr = swiotlb_virt_to_bus(dev, map); - /* Ensure that the address returned is DMA'ble */ + /* + * Ensure that the address returned is DMA'ble + */ if (!dma_capable(dev, dev_addr, size)) { swiotlb_tbl_unmap_single(dev, map, size, dir); - return phys_to_dma(dev, io_tlb_overflow_buffer); + dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer); } return dev_addr; @@ -739,7 +729,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -783,7 +773,8 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); return; } @@ -840,9 +831,9 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir); - if (map == SWIOTLB_MAP_ERROR) { + void *map = map_single(hwdev, sg_phys(sg), + sg->length, dir); + if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. */ swiotlb_full(hwdev, sg->length, dir, 0); @@ -851,7 +842,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, sgl[0].dma_length = 0; return 0; } - sg->dma_address = phys_to_dma(hwdev, map); + sg->dma_address = swiotlb_virt_to_bus(hwdev, map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; @@ -934,7 +925,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer)); } EXPORT_SYMBOL(swiotlb_dma_mapping_error); @@ -947,6 +938,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error); int swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask; } EXPORT_SYMBOL(swiotlb_dma_supported); diff --git a/trunk/mm/compaction.c b/trunk/mm/compaction.c index 5ad7f4f4d6f7..129791218226 100644 --- a/trunk/mm/compaction.c +++ b/trunk/mm/compaction.c @@ -303,10 +303,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_vm_events(COMPACTFREE_SCANNED, nr_scanned); - if (total_isolated) - count_vm_events(COMPACTISOLATED, total_isolated); - return total_isolated; } @@ -613,10 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); - if (nr_isolated) - count_vm_events(COMPACTISOLATED, nr_isolated); - return low_pfn; } @@ -1023,11 +1015,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - 
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, - MR_COMPACTION); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); diff --git a/trunk/mm/huge_memory.c b/trunk/mm/huge_memory.c index d7ee1691fd21..827d9c813051 100644 --- a/trunk/mm/huge_memory.c +++ b/trunk/mm/huge_memory.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -691,7 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } __setup("transparent_hugepage=", setup_transparent_hugepage); -pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -849,8 +848,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -1289,81 +1287,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return page; } -/* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) -{ - struct page *page; - unsigned long haddr = addr & HPAGE_PMD_MASK; - int target_nid; - int current_nid = -1; - bool migrated; - bool page_locked = false; - - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) - goto out_unlock; - - page = pmd_page(pmd); - get_page(page); - current_nid = page_to_nid(page); - count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; - } - - /* Acquire the page lock to serialise THP migrations */ - spin_unlock(&mm->page_table_lock); - lock_page(page); - page_locked = true; - - /* Confirm the PTE did not while locked */ - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - put_page(page); - goto out_unlock; - } - spin_unlock(&mm->page_table_lock); - - /* Migrate the THP to the requested node */ - migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, - page, target_nid); - if (migrated) - current_nid = target_nid; - else { - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - goto out_unlock; - } - goto clear_pmdnuma; - } - - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); - return 0; - -clear_pmdnuma: - pmd = pmd_mknonnuma(pmd); - set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); - update_mmu_cache_pmd(vma, addr, pmdp); - if (page_locked) - unlock_page(page); - -out_unlock: - spin_unlock(&mm->page_table_lock); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); - return 0; -} - int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1452,7 
+1375,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1460,17 +1383,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - if (!prot_numa) - entry = pmd_modify(entry, newprot); - else { - struct page *page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) == 1 && - !pmd_numa(*pmd)) { - entry = pmd_mknuma(entry); - } - } + entry = pmd_modify(entry, newprot); BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); @@ -1561,7 +1474,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->rwsem to + * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1652,7 +1565,6 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1720,8 +1632,6 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1764,7 +1674,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->rwsem held */ +/* must be called with anon_vma->root->mutex hold */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { @@ -1819,7 +1729,7 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) goto out; ret = 0; @@ -1832,7 +1742,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); out: return ret; } @@ -2324,7 +2234,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; - anon_vma_lock_write(vma->anon_vma); + anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index e5318c7793ae..88e7293b96bd 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -3016,7 +3016,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? 
i : -EFAULT; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -3024,7 +3024,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); @@ -3035,15 +3034,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) { - pages++; + if (huge_pmd_unshare(mm, &address, ptep)) continue; - } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); - pages++; } } spin_unlock(&mm->page_table_lock); @@ -3055,8 +3051,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - - return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, diff --git a/trunk/mm/internal.h b/trunk/mm/internal.h index d597f94cc205..52d1fa957194 100644 --- a/trunk/mm/internal.h +++ b/trunk/mm/internal.h @@ -217,18 +217,15 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; - int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + __dec_zone_page_state(page, NR_MLOCK); SetPageMlocked(newpage); - __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); + __inc_zone_page_state(newpage, NR_MLOCK); local_irq_restore(flags); } } -extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); diff --git a/trunk/mm/ksm.c b/trunk/mm/ksm.c index 82dfb4b54321..382d930a0bf1 100644 --- a/trunk/mm/ksm.c +++ b/trunk/mm/ksm.c @@ -1624,7 +1624,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1678,7 +1678,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1731,7 +1731,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/trunk/mm/memcontrol.c b/trunk/mm/memcontrol.c index bbfac5063ca8..6c055929c8cc 100644 --- a/trunk/mm/memcontrol.c +++ b/trunk/mm/memcontrol.c @@ -3289,18 +3289,15 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; - if 
(PageTransHuge(page)) - nr_pages <<= compound_order(page); - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3362,7 +3359,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/trunk/mm/memory-failure.c b/trunk/mm/memory-failure.c index c6e4dd3e1c08..108c52fa60f6 100644 --- a/trunk/mm/memory-failure.c +++ b/trunk/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma_read(page); + av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + page_unlock_anon_vma(av); } /* @@ -1566,8 +1566,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC, - MR_MEMORY_FAILURE); + false, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index e6a3b933517e..db2e9e797a05 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -1504,8 +1503,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(vma, address, pmd); @@ -1535,8 +1532,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pte = *ptep; if (!pte_present(pte)) goto no_page; - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1688,19 +1683,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. 
- */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - i = 0; do { @@ -3430,169 +3412,6 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) -{ - get_page(page); - - count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - - return mpol_misplaced(page, vma, addr); -} - -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) -{ - struct page *page = NULL; - spinlock_t *ptl; - int current_nid = -1; - int target_nid; - bool migrated = false; - - /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. - * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. - */ - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) { - pte_unmap_unlock(ptep, ptl); - goto out; - } - - pte = pte_mknonnuma(pte); - set_pte_at(mm, addr, ptep, pte); - update_mmu_cache(vma, addr, ptep); - - page = vm_normal_page(vma, addr, pte); - if (!page) { - pte_unmap_unlock(ptep, ptl); - return 0; - } - - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); - pte_unmap_unlock(ptep, ptl); - if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. 
- */ - current_nid = numa_node_id(); - put_page(page); - goto out; - } - - /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - current_nid = target_nid; - -out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); - return 0; -} - -/* NUMA hinting page fault entry point for regular pmds */ -#ifdef CONFIG_NUMA_BALANCING -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - pte_t *pte, *orig_pte; - unsigned long _addr = addr & PMD_MASK; - unsigned long offset; - spinlock_t *ptl; - bool numa = false; - int local_nid = numa_node_id(); - - spin_lock(&mm->page_table_lock); - pmd = *pmdp; - if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); - numa = true; - } - spin_unlock(&mm->page_table_lock); - - if (!numa) - return 0; - - /* we're in a page fault so some vma must be in the range */ - BUG_ON(!vma); - BUG_ON(vma->vm_start >= _addr + PMD_SIZE); - offset = max(_addr, vma->vm_start) & ~PMD_MASK; - VM_BUG_ON(offset >= PMD_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); - pte += offset >> PAGE_SHIFT; - for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { - pte_t pteval = *pte; - struct page *page; - int curr_nid = local_nid; - int target_nid; - bool migrated; - if (!pte_present(pteval)) - continue; - if (!pte_numa(pteval)) - continue; - if (addr >= vma->vm_end) { - vma = find_vma(mm, addr); - /* there's a pte present so there must be a vma */ - BUG_ON(!vma); - BUG_ON(addr < vma->vm_start); - } - if (pte_numa(pteval)) { - pteval = pte_mknonnuma(pteval); - set_pte_at(mm, addr, pte, pteval); - } - page = vm_normal_page(vma, addr, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (unlikely(page_mapcount(page) != 1)) - continue; - - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. 
- */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { - put_page(page); - continue; - } - - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); - - pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); - } - pte_unmap_unlock(orig_pte, ptl); - - return 0; -} -#else -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3631,9 +3450,6 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) - return do_numa_page(mm, vma, address, entry, pte, pmd); - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3704,11 +3520,8 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (pmd_numa(orig_pmd)) - return do_huge_pmd_numa_page(mm, vma, address, - orig_pmd, pmd); - - if (dirty && !pmd_write(orig_pmd)) { + if (dirty && !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3723,21 +3536,16 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); } - return 0; } } - if (pmd_numa(*pmd)) - return do_pmd_numa_page(mm, vma, address, pmd); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c index 962e353aa86f..518baa896e83 100644 --- a/trunk/mm/memory_hotplug.c +++ b/trunk/mm/memory_hotplug.c @@ -1055,8 +1055,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. 
*/ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC, - MR_MEMORY_HOTPLUG); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c index d1b315e98627..aaf54566cb6b 100644 --- a/trunk/mm/mempolicy.c +++ b/trunk/mm/mempolicy.c @@ -90,7 +90,6 @@ #include #include #include -#include #include #include @@ -118,26 +117,6 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; -static struct mempolicy preferred_node_policy[MAX_NUMNODES]; - -static struct mempolicy *get_task_policy(struct task_struct *p) -{ - struct mempolicy *pol = p->mempolicy; - int node; - - if (!pol) { - node = numa_node_id(); - if (node != -1) - pol = &preferred_node_policy[node]; - - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; - } - - return pol; -} - static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -275,7 +254,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; + return NULL; /* simply delete any existing policy */ } VM_BUG_ON(!nodes); @@ -290,10 +269,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } - } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) - return ERR_PTR(-EINVAL); - mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -586,36 +561,6 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -/* - * This is used to mark a range of virtual addresses to be inaccessible. - * These are later cleared by a NUMA hinting fault. Depending on these - * faults, pages may be migrated for better NUMA placement. - * - * This is assuming that NUMA faults are handled using PROT_NONE. If - * an architecture makes a different choice, it will need further - * changes to the core. - */ -unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - if (nr_updated) - count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); - - return nr_updated; -} -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ - /* * Check if all pages in a range are on a set of nodes. 
* If pagelist != NULL then isolate pages from the LRU and @@ -634,32 +579,22 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - - if (is_vm_hugetlb_page(vma)) - goto next; - - if (flags & MPOL_MF_LAZY) { - change_prot_numa(vma, start, endvma); - goto next; - } - - if ((flags & MPOL_MF_STRICT) || + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { + vma_migratable(vma)))) { + unsigned long endvma = vma->vm_end; + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -667,7 +602,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } -next: prev = vma; } return first; @@ -1027,8 +961,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC, - MR_SYSCALL); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } @@ -1200,7 +1133,8 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)MPOL_MF_VALID) + if (flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1223,9 +1157,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1262,24 +1193,21 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); /* maybe ... 
*/ - if (!IS_ERR(vma)) - err = mbind_range(mm, start, end, new); - - if (!err) { + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { int nr_failed = 0; + err = mbind_range(mm, start, end, new); + if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC, - MR_MEMPOLICY_MBIND); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } - if (nr_failed && (flags & MPOL_MF_STRICT)) + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); @@ -1618,7 +1546,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol = task->mempolicy; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -2028,7 +1956,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = get_task_policy(current); + struct mempolicy *pol = current->mempolicy; struct page *page; unsigned int cpuset_mems_cookie; @@ -2212,115 +2140,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -/** - * mpol_misplaced - check whether current page node is valid in policy - * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped - * - * Lookup current policy node id for vma,addr and "compare to" page's - * node id. - * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). - * Called from fault path where we know the vma and faulting address. - */ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol; - struct zone *zone; - int curnid = page_to_nid(page); - unsigned long pgoff; - int polnid = -1; - int ret = -1; - - BUG_ON(!vma); - - pol = get_vma_policy(current, vma, addr); - if (!(pol->flags & MPOL_F_MOF)) - goto out; - - switch (pol->mode) { - case MPOL_INTERLEAVE: - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); - - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, vma, pgoff); - break; - - case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; - break; - - case MPOL_BIND: - /* - * allows binding to multiple nodes. - * use current page if in policy nodemask, - * else select nearest allowed node, if any. - * If no allowed nodes, use current [!misplaced]. - */ - if (node_isset(curnid, pol->v.nodes)) - goto out; - (void)first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), - gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; - break; - - default: - BUG(); - } - - /* Migrate the page towards the node whose CPU is referencing it */ - if (pol->flags & MPOL_F_MORON) { - int last_nid; - - polnid = numa_node_id(); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. 
- * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_nid = page_xchg_last_nid(page, polnid); - if (last_nid != polnid) - goto out; - } - - if (curnid != polnid) - ret = polnid; -out: - mpol_cond_put(pol); - - return ret; -} - static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); @@ -2486,50 +2305,6 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } -#ifdef CONFIG_NUMA_BALANCING -static bool __initdata numabalancing_override; - -static void __init check_numabalancing_enable(void) -{ - bool numabalancing_default = false; - - if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) - numabalancing_default = true; - - if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or sysctl"); - set_numabalancing_state(numabalancing_default); - } -} - -static int __init setup_numabalancing(char *str) -{ - int ret = 0; - if (!str) - goto out; - numabalancing_override = true; - - if (!strcmp(str, "enable")) { - set_numabalancing_state(true); - ret = 1; - } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); - ret = 1; - } -out: - if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); - - return ret; -} -__setup("numa_balancing=", setup_numabalancing); -#else -static inline void __init check_numabalancing_enable(void) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2545,15 +2320,6 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); - for_each_node(nid) { - preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), - .mode = MPOL_PREFERRED, - .flags = MPOL_F_MOF | MPOL_F_MORON, - .v = { .preferred_node = nid, }, - }; - } - /* * Set interleaving policy for system init. 
Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or @@ -2580,8 +2346,6 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); - - check_numabalancing_enable(); } /* Reset policy of current process to default */ @@ -2598,13 +2362,14 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ +#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local", + [MPOL_LOCAL] = "local" }; @@ -2650,12 +2415,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { + for (mode = 0; mode <= MPOL_LOCAL; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode >= MPOL_MAX) + if (mode > MPOL_LOCAL) goto out; switch (mode) { diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 32efd8028bc9..cae02711181d 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -39,9 +39,6 @@ #include -#define CREATE_TRACE_POINTS -#include - #include "internal.h" /* @@ -296,7 +293,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { - int expected_count = 0; + int expected_count; void **pslot; if (!mapping) { @@ -424,7 +421,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page) || PageTransHuge(page)) + if (PageHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -768,7 +765,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma_read() understands the subtleties of + * Only page_lock_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. 
*/ anon_vma = page_get_anon_vma(page); @@ -1001,11 +998,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; - int nr_succeeded = 0; int pass = 0; struct page *page; struct page *page2; @@ -1032,7 +1028,6 @@ int migrate_pages(struct list_head *from, retry++; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded++; break; default: /* Permanent failure */ @@ -1043,12 +1038,6 @@ int migrate_pages(struct list_head *from, } rc = nr_failed + retry; out: - if (nr_succeeded) - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - if (nr_failed) - count_vm_events(PGMIGRATE_FAIL, nr_failed); - trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); - if (!swapwrite) current->flags &= ~PF_SWAPWRITE; @@ -1187,8 +1176,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC, - MR_SYSCALL); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } @@ -1452,317 +1440,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } - -#ifdef CONFIG_NUMA_BALANCING -/* - * Returns true if this is a safe migration target node for misplaced NUMA - * pages. Currently it only checks the watermarks which crude - */ -static bool migrate_balanced_pgdat(struct pglist_data *pgdat, - int nr_migrate_pages) -{ - int z; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { - struct zone *zone = pgdat->node_zones + z; - - if (!populated_zone(zone)) - continue; - - if (zone->all_unreclaimable) - continue; - - /* Avoid waking kswapd by allocating pages_to_migrate pages. */ - if (!zone_watermark_ok(zone, 0, - high_wmark_pages(zone) + - nr_migrate_pages, - 0, 0)) - continue; - return true; - } - return false; -} - -static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long data, - int **result) -{ - int nid = (int) data; - struct page *newpage; - - newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & - ~GFP_IOFS, 0); - if (newpage) - page_xchg_last_nid(newpage, page_last_nid(page)); - - return newpage; -} - -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. 
- */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - -/* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) -{ - bool rate_limited = false; - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - spin_lock(&pgdat->numabalancing_migrate_lock); - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; -} - -int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) -{ - int ret = 0; - - /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1)) { - int page_lru; - - if (isolate_lru_page(page)) { - put_page(page); - return 0; - } - - /* Page is isolated */ - ret = 1; - page_lru = page_is_file_cache(page); - if (!PageTransHuge(page)) - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - else - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - HPAGE_PMD_NR); - } - - /* - * Page is either isolated or there is not enough space on the target - * node. If isolated, then it has taken a reference count and the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration. Otherwise the page is - * not to be migrated but the callers reference should still be - * dropped so it does not leak. - */ - put_page(page); - - return ret; -} - -/* - * Attempt to migrate a misplaced page to the specified destination - * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. - */ -int migrate_misplaced_page(struct page *page, int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - int nr_remaining; - LIST_HEAD(migratepages); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) { - put_page(page); - goto out; - } - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! 
- */ - if (numamigrate_update_ratelimit(pgdat, 1)) { - put_page(page); - goto out; - } - - isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) - goto out; - - list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); - if (nr_remaining) { - putback_lru_pages(&migratepages); - isolated = 0; - } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); - BUG_ON(!list_empty(&migratepages)); -out: - return isolated; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node) -{ - unsigned long haddr = address & HPAGE_PMD_MASK; - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; - int page_lru = page_is_file_cache(page); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) - goto out_dropref; - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; - - new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out_dropref; - } - page_xchg_last_nid(new_page, page_last_nid(page)); - - isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - put_page(new_page); - goto out_keep_locked; - } - - /* Prepare a page as a migration target */ - __set_page_locked(new_page); - SetPageSwapBacked(new_page); - - /* anon mapping, we can simply copy page->mapping to the new page: */ - new_page->mapping = page->mapping; - new_page->index = page->index; - migrate_page_copy(new_page, page); - WARN_ON(PageLRU(new_page)); - - /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); - - /* Reverse changes made by migrate_page_copy() */ - if (TestClearPageActive(new_page)) - SetPageActive(page); - if (TestClearPageUnevictable(new_page)) - SetPageUnevictable(page); - mlock_migrate_page(page, new_page); - - unlock_page(new_page); - put_page(new_page); /* Free it */ - - unlock_page(page); - putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out; - } - - /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully. 
- */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mknonnuma(entry); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - - page_add_new_anon_rmap(new_page, vma, haddr); - - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, entry); - page_remove_rmap(page); - /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); - - unlock_page(new_page); - unlock_page(page); - put_page(page); /* Drop the rmap reference */ - put_page(page); /* Drop the LRU isolation reference */ - - count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); - count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - -out: - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - -HPAGE_PMD_NR); - return isolated; - -out_dropref: - put_page(page); -out_keep_locked: - return 0; -} -#endif /* CONFIG_NUMA_BALANCING */ - -#endif /* CONFIG_NUMA */ +#endif diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c index f54b235f29a9..2b7d9e78a569 100644 --- a/trunk/mm/mmap.c +++ b/trunk/mm/mmap.c @@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); @@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - down_write(&anon_vma->root->rwsem); + mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->root->rwsem. If some other vma in this mm shares + * anon_vma->root->mutex. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->rwsem. + * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) @@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->rwsem. + * anon_vma->root->mutex. 
*/ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c index 3dca970367db..e8c3938db6fa 100644 --- a/trunk/mm/mprotect.c +++ b/trunk/mm/mprotect.c @@ -35,16 +35,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_node) + int dirty_accountable) { - struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; - bool all_same_node = true; - int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -52,43 +48,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - if (!prot_numa) { - ptent = pte_modify(ptent, newprot); - updated = true; - } else { - struct page *page; - - page = vm_normal_page(vma, addr, oldpte); - if (page) { - int this_nid = page_to_nid(page); - if (last_nid == -1) - last_nid = this_nid; - if (last_nid != this_nid) - all_same_node = false; - - /* only check non-shared pages */ - if (!pte_numa(oldpte) && - page_mapcount(page) == 1) { - ptent = pte_mknuma(ptent); - updated = true; - } - } - } + ptent = pte_modify(ptent, newprot); /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) { + if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) - pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -102,40 +72,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } - pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - - *ret_all_same_node = all_same_node; - return pages; } -#ifdef CONFIG_NUMA_BALANCING -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); -} -#else -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; - bool all_same_node; pmd = pmd_offset(pud, addr); do { @@ -143,59 +91,42 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { - pages += HPAGE_PMD_NR; + else if (change_huge_pmd(vma, pmd, addr, newprot)) continue; - } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - pages += 
change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_node); - - /* - * If we are changing protections for NUMA hinting faults then - * set pmd_numa if the examined pages were all on the same - * node. This allows a regular PMD to be handled as one fault - * and effectively batches the taking of the PTL - */ - if (prot_numa && all_same_node) - change_pmd_protnuma(vma->vm_mm, addr, pmd); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); - - return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { pud_t *pud; unsigned long next; - unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); - - return pages; } -static unsigned long change_protection_range(struct vm_area_struct *vma, +static void change_protection(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; unsigned long start = addr; - unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -204,32 +135,10 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); - - /* Only flush the TLB if we actually modified any entries: */ - if (pages) - flush_tlb_range(vma, start, end); - - return pages; -} - -unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long pages; - - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - pages = hugetlb_change_protection(vma, start, end, newprot); - else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); - - return pages; + flush_tlb_range(vma, start, end); } int @@ -304,8 +213,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); - + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); + else + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/trunk/mm/mremap.c b/trunk/mm/mremap.c index e1031e1f6a61..eabb24da6c9e 100644 --- a/trunk/mm/mremap.c +++ b/trunk/mm/mremap.c @@ -104,7 +104,7 @@ 
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); } } diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index d037c8bc1512..83637dfba110 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -611,7 +611,6 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3884,7 +3883,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); - reset_page_last_nid(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -4528,11 +4526,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); -#ifdef CONFIG_NUMA_BALANCING - spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; -#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); @@ -5807,8 +5800,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, false, MIGRATE_SYNC, - MR_CMA); + 0, false, MIGRATE_SYNC); } putback_movable_pages(&cc->migratepages); diff --git a/trunk/mm/pgtable-generic.c b/trunk/mm/pgtable-generic.c index 0c8323fe6c8f..e642627da6b7 100644 --- a/trunk/mm/pgtable-generic.c +++ b/trunk/mm/pgtable-generic.c @@ -12,8 +12,8 @@ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed), as well as write - * permission. Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. 
This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); - flush_tlb_fix_spurious_fault(vma, address); + flush_tlb_page(vma, address); } return changed; } @@ -88,8 +88,7 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, { pte_t pte; pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) - flush_tlb_page(vma, address); + flush_tlb_page(vma, address); return pte; } #endif diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c index 2c78f8cadc95..face808a489e 100644 --- a/trunk/mm/rmap.c +++ b/trunk/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->rwsem + * anon_vma->mutex * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock */ @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma_read() such that + * Synchronize against page_lock_anon_vma() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * down_read_trylock() from page_lock_anon_vma_read(). This orders: + * mutex_trylock() from page_lock_anon_vma(). This orders: * - * page_lock_anon_vma_read() VS put_anon_vma() - * down_read_trylock() atomic_dec_and_test() + * page_lock_anon_vma() VS put_anon_vma() + * mutex_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() rwsem_is_locked() + * atomic_read() mutex_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ - if (rwsem_is_locked(&anon_vma->root->rwsem)) { - anon_vma_lock_write(anon_vma); + if (mutex_is_locked(&anon_vma->root->mutex)) { + anon_vma_lock(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma_read() + * optimistically looked up an anon_vma in page_lock_anon_vma() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). 
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - up_write(&root->rwsem); + mutex_unlock(&root->mutex); root = new_root; - down_write(&root->rwsem); + mutex_lock(&root->mutex); } return root; } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - up_write(&root->rwsem); + mutex_unlock(&root->mutex); } /* @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to write-acquire the anon_vma->root->rwsem. + * needing to acquire the anon_vma->root->mutex. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - init_rwsem(&anon_vma->rwsem); + mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT; } @@ -442,7 +442,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma_read(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (down_read_trylock(&root_anon_vma->rwsem)) { + if (mutex_trylock(&root_anon_vma->mutex)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - up_read(&root_anon_vma->rwsem); + mutex_unlock(&root_anon_vma->mutex); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock_read(anon_vma); + anon_vma_lock(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock_write() recursion. + * we'll deadlock on the anon_vma_lock() recursion. 
*/ - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); } /* @@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return referenced; @@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); return referenced; } @@ -1315,7 +1315,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. + * we now hold anon_vma->mutex or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; @@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma_read(anon_vma); + page_unlock_anon_vma(anon_vma); return ret; } @@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. 
Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock_read(anon_vma); + anon_vma_lock(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock_read(anon_vma); + anon_vma_unlock(anon_vma); return ret; } diff --git a/trunk/mm/vmstat.c b/trunk/mm/vmstat.c index 9800306c8195..df14808f0a36 100644 --- a/trunk/mm/vmstat.c +++ b/trunk/mm/vmstat.c @@ -774,20 +774,10 @@ const char * const vmstat_text[] = { "pgrotated", -#ifdef CONFIG_NUMA_BALANCING - "numa_pte_updates", - "numa_hint_faults", - "numa_hint_faults_local", - "numa_pages_migrated", -#endif -#ifdef CONFIG_MIGRATION - "pgmigrate_success", - "pgmigrate_fail", -#endif #ifdef CONFIG_COMPACTION - "compact_migrate_scanned", - "compact_free_scanned", - "compact_isolated", + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", "compact_stall", "compact_fail", "compact_success", diff --git a/trunk/net/dns_resolver/dns_key.c b/trunk/net/dns_resolver/dns_key.c index 0a69d0757795..8aa4b1115384 100644 --- a/trunk/net/dns_resolver/dns_key.c +++ b/trunk/net/dns_resolver/dns_key.c @@ -259,16 +259,20 @@ static int __init init_dns_resolver(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".dns_resolver", - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + keyring = key_alloc(&key_type_keyring, ".dns_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + ret = register_key_type(&key_type_dns_resolver); if (ret < 0) goto failed_put_key; @@ -300,4 +304,3 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); - diff --git a/trunk/security/keys/key.c b/trunk/security/keys/key.c index 8fb7c7bd4657..a15c9da8f971 100644 --- a/trunk/security/keys/key.c +++ b/trunk/security/keys/key.c @@ -854,13 +854,13 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; + perm |= KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_LINK | KEY_USR_SETATTR; if (ktype->read) - perm |= KEY_POS_READ; + perm |= KEY_POS_READ | KEY_USR_READ; if (ktype == &key_type_keyring || ktype->update) - perm |= KEY_POS_WRITE; + perm |= KEY_USR_WRITE; } /* allocate a new key */ diff --git a/trunk/security/keys/keyctl.c b/trunk/security/keys/keyctl.c index 4b5c948eb414..5d34b4e827d6 100644 --- a/trunk/security/keys/keyctl.c +++ b/trunk/security/keys/keyctl.c @@ -1132,12 +1132,12 @@ long keyctl_instantiate_key_iov(key_serial_t id, ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret < 0) - goto err; + return ret; if (ret == 0) goto 
no_payload_free; ret = keyctl_instantiate_key_common(id, iov, ioc, ret, ringid); -err: + if (iov != iovstack) kfree(iov); return ret; @@ -1495,8 +1495,7 @@ long keyctl_session_to_parent(void) goto error_keyring; newwork = &cred->rcu; - cred->session_keyring = key_ref_to_ptr(keyring_r); - keyring_r = NULL; + cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); init_task_work(newwork, key_change_session_keyring); me = current; @@ -1520,7 +1519,7 @@ long keyctl_session_to_parent(void) mycred = current_cred(); pcred = __task_cred(parent); if (mycred == pcred || - mycred->session_keyring == pcred->session_keyring) { + mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) { ret = 0; goto unlock; } @@ -1536,9 +1535,9 @@ long keyctl_session_to_parent(void) goto unlock; /* the keyrings must have the same UID */ - if ((pcred->session_keyring && - !uid_eq(pcred->session_keyring->uid, mycred->euid)) || - !uid_eq(mycred->session_keyring->uid, mycred->euid)) + if ((pcred->tgcred->session_keyring && + !uid_eq(pcred->tgcred->session_keyring->uid, mycred->euid)) || + !uid_eq(mycred->tgcred->session_keyring->uid, mycred->euid)) goto unlock; /* cancel an already pending keyring replacement */ diff --git a/trunk/security/keys/keyring.c b/trunk/security/keys/keyring.c index 6ece7f2e5707..6e42df15a24c 100644 --- a/trunk/security/keys/keyring.c +++ b/trunk/security/keys/keyring.c @@ -257,14 +257,17 @@ static long keyring_read(const struct key *keyring, * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, - const struct cred *cred, key_perm_t perm, - unsigned long flags, struct key *dest) + const struct cred *cred, unsigned long flags, + struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, perm, flags); + uid, gid, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, + flags); + if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -275,7 +278,6 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, return keyring; } -EXPORT_SYMBOL(keyring_alloc); /** * keyring_search_aux - Search a keyring tree for a key matching some criteria diff --git a/trunk/security/keys/process_keys.c b/trunk/security/keys/process_keys.c index 58dfe0890947..86468f385fc8 100644 --- a/trunk/security/keys/process_keys.c +++ b/trunk/security/keys/process_keys.c @@ -45,12 +45,10 @@ int install_user_keyrings(void) struct user_struct *user; const struct cred *cred; struct key *uid_keyring, *session_keyring; - key_perm_t user_keyring_perm; char buf[20]; int ret; uid_t uid; - user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; cred = current_cred(); user = cred->user; uid = from_kuid(cred->user_ns, user->uid); @@ -75,8 +73,8 @@ int install_user_keyrings(void) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + cred, KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; @@ -91,8 +89,7 @@ int install_user_keyrings(void) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -133,7 +130,6 @@ int 
install_thread_keyring_to_cred(struct cred *new) struct key *keyring; keyring = keyring_alloc("_tid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -174,18 +170,27 @@ static int install_thread_keyring(void) int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; + int ret; - if (new->process_keyring) + if (new->tgcred->process_keyring) return -EEXIST; - keyring = keyring_alloc("_pid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, - KEY_ALLOC_QUOTA_OVERRUN, NULL); + keyring = keyring_alloc("_pid", new->uid, new->gid, + new, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); - new->process_keyring = keyring; - return 0; + spin_lock_irq(&new->tgcred->lock); + if (!new->tgcred->process_keyring) { + new->tgcred->process_keyring = keyring; + keyring = NULL; + ret = 0; + } else { + ret = -EEXIST; + } + spin_unlock_irq(&new->tgcred->lock); + key_put(keyring); + return ret; } /* @@ -226,12 +231,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; - if (cred->session_keyring) + if (cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, - flags, NULL); + keyring = keyring_alloc("_ses", cred->uid, cred->gid, + cred, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -239,11 +243,17 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) } /* install the keyring */ - old = cred->session_keyring; - rcu_assign_pointer(cred->session_keyring, keyring); - - if (old) + spin_lock_irq(&cred->tgcred->lock); + old = cred->tgcred->session_keyring; + rcu_assign_pointer(cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&cred->tgcred->lock); + + /* we're using RCU on the pointer, but there's no point synchronising + * on it if it didn't previously point to anything */ + if (old) { + synchronize_rcu(); key_put(old); + } return 0; } @@ -358,9 +368,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (cred->process_keyring) { + if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->process_keyring, 1), + make_key_ref(cred->tgcred->process_keyring, 1), cred, type, description, match, no_state_check); if (!IS_ERR(key_ref)) goto found; @@ -379,10 +389,12 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (cred->session_keyring) { + if (cred->tgcred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( - make_key_ref(rcu_dereference(cred->session_keyring), 1), + make_key_ref(rcu_dereference( + cred->tgcred->session_keyring), + 1), cred, type, description, match, no_state_check); rcu_read_unlock(); @@ -552,7 +564,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, break; case KEY_SPEC_PROCESS_KEYRING: - if (!cred->process_keyring) { + if (!cred->tgcred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; @@ -564,13 +576,13 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, goto reget_creds; } - key = cred->process_keyring; + key = cred->tgcred->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!cred->session_keyring) { + if 
(!cred->tgcred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -585,7 +597,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, if (ret < 0) goto error; goto reget_creds; - } else if (cred->session_keyring == + } else if (cred->tgcred->session_keyring == cred->user->session_keyring && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); @@ -595,7 +607,7 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, } rcu_read_lock(); - key = rcu_dereference(cred->session_keyring); + key = rcu_dereference(cred->tgcred->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); @@ -755,6 +767,12 @@ long join_session_keyring(const char *name) struct key *keyring; long ret, serial; + /* only permit this if there's a single thread in the thread group - + * this avoids us having to adjust the creds on all threads and risking + * ENOMEM */ + if (!current_is_single_threaded()) + return -EMLINK; + new = prepare_creds(); if (!new) return -ENOMEM; @@ -766,7 +784,7 @@ long join_session_keyring(const char *name) if (ret < 0) goto error; - serial = new->session_keyring->serial; + serial = new->tgcred->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; @@ -780,10 +798,8 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc( - name, old->uid, old->gid, old, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, - KEY_ALLOC_IN_QUOTA, NULL); + keyring = keyring_alloc(name, old->uid, old->gid, old, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; @@ -791,9 +807,6 @@ long join_session_keyring(const char *name) } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; - } else if (keyring == new->session_keyring) { - ret = 0; - goto error2; } /* we've got a keyring - now to install it */ @@ -850,7 +863,8 @@ void key_change_session_keyring(struct callback_head *twork) new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); - new->process_keyring = key_get(old->process_keyring); + new->tgcred->tgid = old->tgcred->tgid; + new->tgcred->process_keyring = key_get(old->tgcred->process_keyring); security_transfer_creds(new, old); diff --git a/trunk/security/keys/request_key.c b/trunk/security/keys/request_key.c index 4bd6bdb74193..66e21184b559 100644 --- a/trunk/security/keys/request_key.c +++ b/trunk/security/keys/request_key.c @@ -126,7 +126,6 @@ static int call_sbin_request_key(struct key_construction *cons, cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL); put_cred(cred); if (IS_ERR(keyring)) { @@ -151,12 +150,12 @@ static int call_sbin_request_key(struct key_construction *cons, cred->thread_keyring ? 
cred->thread_keyring->serial : 0); prkey = 0; - if (cred->process_keyring) - prkey = cred->process_keyring->serial; + if (cred->tgcred->process_keyring) + prkey = cred->tgcred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); rcu_read_lock(); - session = rcu_dereference(cred->session_keyring); + session = rcu_dereference(cred->tgcred->session_keyring); if (!session) session = cred->user->session_keyring; sskey = session->serial; @@ -298,14 +297,14 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = key_get(cred->process_keyring); + dest_keyring = key_get(cred->tgcred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( - rcu_dereference(cred->session_keyring)); + rcu_dereference(cred->tgcred->session_keyring)); rcu_read_unlock(); if (dest_keyring) @@ -348,7 +347,6 @@ static int construct_alloc_key(struct key_type *type, const struct cred *cred = current_cred(); unsigned long prealloc; struct key *key; - key_perm_t perm; key_ref_t key_ref; int ret; @@ -357,15 +355,8 @@ static int construct_alloc_key(struct key_type *type, *_key = NULL; mutex_lock(&user->cons_lock); - perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; - if (type->read) - perm |= KEY_POS_READ; - if (type == &key_type_keyring || type->update) - perm |= KEY_POS_WRITE; - key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, - perm, flags); + KEY_POS_ALL, flags); if (IS_ERR(key)) goto alloc_failed; diff --git a/trunk/security/smack/Kconfig b/trunk/security/smack/Kconfig index e69de9c642b7..603b08784341 100644 --- a/trunk/security/smack/Kconfig +++ b/trunk/security/smack/Kconfig @@ -1,10 +1,6 @@ config SECURITY_SMACK bool "Simplified Mandatory Access Control Kernel Support" - depends on NET - depends on INET - depends on SECURITY - select NETLABEL - select SECURITY_NETWORK + depends on NETLABEL && SECURITY_NETWORK default n help This selects the Simplified Mandatory Access Control Kernel. 
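For reference, the permission-defaulting logic this patch removes from construct_alloc_key() (and the matching defaults in key_create_or_update()) follows a simple capability-dependent mask pattern: start from a fixed possessor set and widen it only for operations the key type actually supports. The following is a minimal standalone sketch of that pattern only; the SK_* bit values and the sk_key_type layout are invented stand-ins for this illustration, not the kernel's KEY_POS_*/KEY_USR_* constants or struct key_type.

/*
 * Minimal sketch of capability-dependent default permission masks,
 * loosely modelled on the logic removed above.  All names and bit
 * values here are assumptions for illustration, not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

#define SK_POS_VIEW    0x01u
#define SK_POS_READ    0x02u
#define SK_POS_WRITE   0x04u
#define SK_POS_SEARCH  0x08u
#define SK_POS_LINK    0x10u
#define SK_POS_SETATTR 0x20u
#define SK_USR_VIEW    0x0100u

struct sk_key_type {
	const char *name;
	int (*read)(void);    /* non-NULL if keys of this type can be read */
	int (*update)(void);  /* non-NULL if keys of this type can be updated */
};

/* Start from a fixed possessor set, widen only for supported operations. */
static unsigned int sk_default_perm(const struct sk_key_type *type)
{
	unsigned int perm = SK_POS_VIEW | SK_POS_SEARCH | SK_POS_LINK | SK_POS_SETATTR;

	perm |= SK_USR_VIEW;
	if (type->read)
		perm |= SK_POS_READ;
	if (type->update)
		perm |= SK_POS_WRITE;
	return perm;
}

static int sk_dummy_read(void) { return 0; }

int main(void)
{
	struct sk_key_type readable = { "readable", sk_dummy_read, NULL };
	struct sk_key_type opaque   = { "opaque",   NULL,          NULL };

	printf("%-8s -> 0x%04x\n", readable.name, sk_default_perm(&readable));
	printf("%-8s -> 0x%04x\n", opaque.name,   sk_default_perm(&opaque));
	return 0;
}

The design point being reverted is exactly this conditional widening: the older code replaced by it simply granted a broad fixed mask regardless of the key type's capabilities.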
diff --git a/trunk/security/smack/smackfs.c b/trunk/security/smack/smackfs.c index 76a5dca46404..99929a50093a 100644 --- a/trunk/security/smack/smackfs.c +++ b/trunk/security/smack/smackfs.c @@ -2063,19 +2063,6 @@ static const struct file_operations smk_revoke_subj_ops = { .llseek = generic_file_llseek, }; -static struct kset *smackfs_kset; -/** - * smk_init_sysfs - initialize /sys/fs/smackfs - * - */ -static int smk_init_sysfs(void) -{ - smackfs_kset = kset_create_and_add("smackfs", NULL, fs_kobj); - if (!smackfs_kset) - return -ENOMEM; - return 0; -} - /** * smk_fill_super - fill the /smackfs superblock * @sb: the empty superblock @@ -2196,10 +2183,6 @@ static int __init init_smk_fs(void) if (!security_module_enable(&smack_ops)) return 0; - err = smk_init_sysfs(); - if (err) - printk(KERN_ERR "smackfs: sysfs mountpoint problem.\n"); - err = register_filesystem(&smk_fs_type); if (!err) { smackfs_mount = kern_mount(&smk_fs_type); diff --git a/trunk/security/yama/yama_lsm.c b/trunk/security/yama/yama_lsm.c index 2663145d1197..b4c29848b49d 100644 --- a/trunk/security/yama/yama_lsm.c +++ b/trunk/security/yama/yama_lsm.c @@ -17,7 +17,6 @@ #include #include #include -#include #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 @@ -30,37 +29,12 @@ static int ptrace_scope = YAMA_SCOPE_RELATIONAL; struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; - bool invalid; struct list_head node; - struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); -static void yama_relation_cleanup(struct work_struct *work); -static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); - -/** - * yama_relation_cleanup - remove invalid entries from the relation list - * - */ -static void yama_relation_cleanup(struct work_struct *work) -{ - struct ptrace_relation *relation; - - spin_lock(&ptracer_relations_lock); - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) { - list_del_rcu(&relation->node); - kfree_rcu(relation, rcu); - } - } - rcu_read_unlock(); - spin_unlock(&ptracer_relations_lock); -} - /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace @@ -74,34 +48,32 @@ static void yama_relation_cleanup(struct work_struct *work) static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { - struct ptrace_relation *relation, *added; + int rc = 0; + struct ptrace_relation *added; + struct ptrace_relation *entry, *relation = NULL; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; - added->tracee = tracee; - added->tracer = tracer; - added->invalid = false; - - spin_lock(&ptracer_relations_lock); - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; - if (relation->tracee == tracee) { - list_replace_rcu(&relation->node, &added->node); - kfree_rcu(relation, rcu); - goto out; + spin_lock_bh(&ptracer_relations_lock); + list_for_each_entry(entry, &ptracer_relations, node) + if (entry->tracee == tracee) { + relation = entry; + break; } + if (!relation) { + relation = added; + relation->tracee = tracee; + list_add(&relation->node, &ptracer_relations); } + relation->tracer = tracer; - list_add_rcu(&added->node, &ptracer_relations); + spin_unlock_bh(&ptracer_relations_lock); + if (added != relation) + kfree(added); -out: - rcu_read_unlock(); - spin_unlock(&ptracer_relations_lock); - return 
0; + return rc; } /** @@ -112,23 +84,16 @@ static int yama_ptracer_add(struct task_struct *tracer, static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { - struct ptrace_relation *relation; - bool marked = false; + struct ptrace_relation *relation, *safe; - rcu_read_lock(); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; + spin_lock_bh(&ptracer_relations_lock); + list_for_each_entry_safe(relation, safe, &ptracer_relations, node) if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { - relation->invalid = true; - marked = true; + list_del(&relation->node); + kfree(relation); } - } - rcu_read_unlock(); - - if (marked) - schedule_work(&yama_relation_work); + spin_unlock_bh(&ptracer_relations_lock); } /** @@ -252,22 +217,21 @@ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *parent = NULL; bool found = false; + spin_lock_bh(&ptracer_relations_lock); rcu_read_lock(); if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); - list_for_each_entry_rcu(relation, &ptracer_relations, node) { - if (relation->invalid) - continue; + list_for_each_entry(relation, &ptracer_relations, node) if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } - } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; rcu_read_unlock(); + spin_unlock_bh(&ptracer_relations_lock); return rc; }
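The final hunks restore Yama's ptracer exception list to a plain lock-protected linked list: one entry per tracee, replaced in place on re-registration, removed when either endpoint goes away, and scanned on access checks, instead of the RCU list with invalid-marking and deferred cleanup that the patch removes. The sketch below reproduces only that add-or-replace / delete / lookup pattern as standalone userspace C with a pthread mutex; the relation type, function names, and use of plain integer pids are assumptions made for the illustration, not the kernel's code.

/*
 * Standalone sketch of a lock-protected exception list with the same
 * add-or-replace, delete-by-endpoint, and lookup operations as the
 * restored yama_ptracer_* helpers.  Names and types are illustrative.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct relation {
	int tracer;              /* pid allowed to attach */
	int tracee;              /* pid being traced */
	struct relation *next;
};

static struct relation *relations;
static pthread_mutex_t relations_lock = PTHREAD_MUTEX_INITIALIZER;

/* Add an exception, replacing any existing entry for the same tracee. */
static int relation_add(int tracer, int tracee)
{
	struct relation *r;

	pthread_mutex_lock(&relations_lock);
	for (r = relations; r; r = r->next) {
		if (r->tracee == tracee) {
			r->tracer = tracer;      /* replace in place */
			pthread_mutex_unlock(&relations_lock);
			return 0;
		}
	}
	r = malloc(sizeof(*r));
	if (!r) {
		pthread_mutex_unlock(&relations_lock);
		return -1;
	}
	r->tracer = tracer;
	r->tracee = tracee;
	r->next = relations;
	relations = r;
	pthread_mutex_unlock(&relations_lock);
	return 0;
}

/* Drop every exception involving the given pid, as either endpoint. */
static void relation_del(int pid)
{
	struct relation **pp = &relations, *r;

	pthread_mutex_lock(&relations_lock);
	while ((r = *pp) != NULL) {
		if (r->tracee == pid || r->tracer == pid) {
			*pp = r->next;
			free(r);
		} else {
			pp = &r->next;
		}
	}
	pthread_mutex_unlock(&relations_lock);
}

/* Return the registered tracer for a tracee, or -1 if none. */
static int relation_find(int tracee)
{
	struct relation *r;
	int tracer = -1;

	pthread_mutex_lock(&relations_lock);
	for (r = relations; r; r = r->next) {
		if (r->tracee == tracee) {
			tracer = r->tracer;
			break;
		}
	}
	pthread_mutex_unlock(&relations_lock);
	return tracer;
}

int main(void)
{
	relation_add(100, 200);          /* pid 100 may trace pid 200 */
	relation_add(101, 200);          /* replaced: now pid 101 */
	printf("tracer for 200: %d\n", relation_find(200));
	relation_del(200);
	printf("tracer for 200 after del: %d\n", relation_find(200));
	return 0;
}

The trade-off is the one visible in the diff itself: the plain list takes the spinlock on every lookup (including ptracer_exception_found), whereas the removed RCU variant kept lookups lock-free at the cost of deferred cleanup work.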