From f75dbe81131e32cf11795b38f7a225a0c09d2b79 Mon Sep 17 00:00:00 2001
From: James Morris
Date: Wed, 18 Jan 2012 10:40:44 +1100
Subject: [PATCH]

--- yaml ---
r: 286075
b: refs/heads/master
c: 89879a7eb81f69e6f63bdb2a442fb765c46482c0
h: refs/heads/master
i:
  286073: 0c7d71eff6d9710689b0283b8dcbeb1174b0a46a
  286071: 61932c9bdb80a5c6a5b7575f687a7e40650d53c5
v: v3
---
 [refs] | 2 +-
 trunk/MAINTAINERS | 2 +-
 trunk/arch/arm/include/asm/kprobes.h | 1 +
 trunk/arch/arm/include/asm/ptrace.h | 5 -
 trunk/arch/arm/include/asm/thread_info.h | 6 -
 trunk/arch/arm/kernel/entry-common.S | 4 +-
 trunk/arch/arm/kernel/ptrace.c | 16 +-
 trunk/arch/ia64/include/asm/ptrace.h | 13 +-
 trunk/arch/ia64/kernel/ptrace.c | 18 +-
 trunk/arch/microblaze/include/asm/ptrace.h | 5 -
 trunk/arch/microblaze/kernel/ptrace.c | 9 +-
 trunk/arch/microblaze/kernel/setup.c | 21 +-
 trunk/arch/mips/include/asm/ptrace.h | 14 +-
 trunk/arch/mips/kernel/ptrace.c | 11 +-
 trunk/arch/powerpc/include/asm/ptrace.h | 13 +-
 trunk/arch/powerpc/kernel/ptrace.c | 30 +-
 trunk/arch/s390/include/asm/ptrace.h | 6 +-
 trunk/arch/s390/kernel/ptrace.c | 15 +-
 trunk/arch/sh/include/asm/ptrace_32.h | 5 +-
 trunk/arch/sh/include/asm/ptrace_64.h | 5 +-
 trunk/arch/sh/kernel/ptrace_32.c | 11 +-
 trunk/arch/sh/kernel/ptrace_64.c | 11 +-
 trunk/arch/sparc/include/asm/ptrace.h | 10 +-
 trunk/arch/sparc/kernel/ptrace_64.c | 28 +-
 trunk/arch/um/kernel/ptrace.c | 20 +-
 trunk/arch/x86/ia32/ia32entry.S | 14 +-
 trunk/arch/x86/kernel/entry_32.S | 10 +-
 trunk/arch/x86/kernel/entry_64.S | 14 +-
 trunk/arch/x86/kernel/ptrace.c | 25 +-
 trunk/arch/x86/kernel/vm86_32.c | 4 +-
 trunk/arch/x86/um/shared/sysdep/ptrace.h | 5 -
 trunk/arch/xtensa/kernel/ptrace.c | 3 +-
 trunk/block/cfq-iosched.c | 7 +-
 trunk/drivers/usb/host/ehci-xilinx-of.c | 2 +-
 trunk/drivers/xen/xen-balloon.c | 2 +-
 trunk/fs/btrfs/Kconfig | 19 -
 trunk/fs/btrfs/Makefile | 3 +-
 trunk/fs/btrfs/backref.c | 1131 ++------
 trunk/fs/btrfs/backref.h | 5 -
 trunk/fs/btrfs/btrfs_inode.h | 3 -
 trunk/fs/btrfs/check-integrity.c | 3068 --------------------
 trunk/fs/btrfs/check-integrity.h | 36 -
 trunk/fs/btrfs/ctree.c | 42 +-
 trunk/fs/btrfs/ctree.h | 239 +-
 trunk/fs/btrfs/delayed-inode.c | 45 +-
 trunk/fs/btrfs/delayed-ref.c | 153 +-
 trunk/fs/btrfs/delayed-ref.h | 104 +-
 trunk/fs/btrfs/disk-io.c | 119 +-
 trunk/fs/btrfs/disk-io.h | 6 +-
 trunk/fs/btrfs/export.c | 2 +-
 trunk/fs/btrfs/extent-tree.c | 465 +--
 trunk/fs/btrfs/extent_io.c | 6 +-
 trunk/fs/btrfs/extent_io.h | 2 -
 trunk/fs/btrfs/file.c | 11 +-
 trunk/fs/btrfs/free-space-cache.c | 417 +--
 trunk/fs/btrfs/inode-map.c | 4 -
 trunk/fs/btrfs/inode.c | 66 +-
 trunk/fs/btrfs/ioctl.c | 268 +-
 trunk/fs/btrfs/ioctl.h | 54 -
 trunk/fs/btrfs/locking.c | 53 +-
 trunk/fs/btrfs/relocation.c | 20 +-
 trunk/fs/btrfs/scrub.c | 12 +-
 trunk/fs/btrfs/super.c | 190 +-
 trunk/fs/btrfs/transaction.c | 20 +-
 trunk/fs/btrfs/tree-log.c | 2 +-
 trunk/fs/btrfs/ulist.c | 220 --
 trunk/fs/btrfs/ulist.h | 68 -
 trunk/fs/btrfs/volumes.c | 993 ++-----
 trunk/fs/btrfs/volumes.h | 54 +-
 trunk/fs/btrfs/xattr.c | 2 +-
 trunk/fs/namei.c | 28 +-
 trunk/fs/proc/base.c | 150 +-
 trunk/fs/xfs/xfs_aops.c | 29 +-
 trunk/fs/xfs/xfs_attr.c | 4 +
 trunk/fs/xfs/xfs_attr_leaf.c | 9 +
 trunk/fs/xfs/xfs_bmap.c | 116 +-
 trunk/fs/xfs/xfs_dfrag.c | 43 +-
 trunk/fs/xfs/xfs_file.c | 184 +-
 trunk/fs/xfs/xfs_fs_subr.c | 2 +-
 trunk/fs/xfs/xfs_iget.c | 24 +-
 trunk/fs/xfs/xfs_inode.c | 193 +-
 trunk/fs/xfs/xfs_inode.h | 114 +-
 trunk/fs/xfs/xfs_inode_item.c | 8 +-
 trunk/fs/xfs/xfs_iomap.c | 46 +-
 trunk/fs/xfs/xfs_iops.c | 46 +-
 trunk/fs/xfs/xfs_qm_syscalls.c | 8 +-
trunk/fs/xfs/xfs_super.c | 8 + trunk/fs/xfs/xfs_sync.c | 9 +- trunk/fs/xfs/xfs_trace.h | 29 +- trunk/fs/xfs/xfs_vnodeops.c | 44 +- trunk/include/linux/audit.h | 116 +- trunk/include/linux/kref.h | 1 - trunk/include/linux/ptrace.h | 10 - trunk/include/linux/tty_driver.h | 1 + trunk/include/trace/events/btrfs.h | 203 -- trunk/init/Kconfig | 16 +- trunk/kernel/audit.c | 4 +- trunk/kernel/audit.h | 6 +- trunk/kernel/auditfilter.c | 17 +- trunk/kernel/auditsc.c | 735 ++--- trunk/kernel/capability.c | 2 +- trunk/kernel/exit.c | 3 +- trunk/kernel/fork.c | 2 + trunk/kernel/seccomp.c | 2 - trunk/security/integrity/ima/ima_audit.c | 8 +- trunk/security/lsm_audit.c | 27 +- trunk/sound/core/Kconfig | 13 +- trunk/sound/pci/au88x0/au88x0.c | 13 +- trunk/sound/pci/au88x0/au88x0.h | 1 - trunk/sound/pci/au88x0/au88x0_pcm.c | 1 - trunk/sound/pci/hda/hda_intel.c | 1 - trunk/sound/pci/hda/patch_sigmatel.c | 2 +- trunk/sound/pci/oxygen/xonar_wm87x6.c | 1 - 113 files changed, 2222 insertions(+), 8326 deletions(-) delete mode 100644 trunk/fs/btrfs/check-integrity.c delete mode 100644 trunk/fs/btrfs/check-integrity.h delete mode 100644 trunk/fs/btrfs/ulist.c delete mode 100644 trunk/fs/btrfs/ulist.h diff --git a/[refs] b/[refs] index 70eaeb510854..7f96309ad7cf 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: f429ee3b808118591d1f3cdf3c0d0793911a5677 +refs/heads/master: 89879a7eb81f69e6f63bdb2a442fb765c46482c0 diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 2a90101309d1..ece8935025e3 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -5846,7 +5846,7 @@ F: drivers/mmc/host/sdhci-spear.c SECURITY SUBSYSTEM M: James Morris L: linux-security-module@vger.kernel.org (suggested Cc:) -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git W: http://security.wiki.kernel.org/ S: Supported F: security/ diff --git a/trunk/arch/arm/include/asm/kprobes.h b/trunk/arch/arm/include/asm/kprobes.h index f82ec22eeb11..feec86768f9c 100644 --- a/trunk/arch/arm/include/asm/kprobes.h +++ b/trunk/arch/arm/include/asm/kprobes.h @@ -24,6 +24,7 @@ #define MAX_INSN_SIZE 2 #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ +#define regs_return_value(regs) ((regs)->ARM_r0) #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/trunk/arch/arm/include/asm/ptrace.h b/trunk/arch/arm/include/asm/ptrace.h index 451808ba1211..96187ff58c24 100644 --- a/trunk/arch/arm/include/asm/ptrace.h +++ b/trunk/arch/arm/include/asm/ptrace.h @@ -189,11 +189,6 @@ static inline int valid_user_regs(struct pt_regs *regs) return 0; } -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->ARM_r0; -} - #define instruction_pointer(regs) (regs)->ARM_pc #ifdef CONFIG_SMP diff --git a/trunk/arch/arm/include/asm/thread_info.h b/trunk/arch/arm/include/asm/thread_info.h index d4c24d412a8d..0f30c3a78fc1 100644 --- a/trunk/arch/arm/include/asm/thread_info.h +++ b/trunk/arch/arm/include/asm/thread_info.h @@ -129,7 +129,6 @@ extern void vfp_flush_hwstate(struct thread_info *); /* * thread information flags: * TIF_SYSCALL_TRACE - syscall trace active - * TIF_SYSCAL_AUDIT - syscall auditing active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -140,7 +139,6 @@ extern void vfp_flush_hwstate(struct thread_info *); #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before 
returning to user */ #define TIF_SYSCALL_TRACE 8 -#define TIF_SYSCALL_AUDIT 9 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -151,15 +149,11 @@ extern void vfp_flush_hwstate(struct thread_info *); #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -/* Checks for any syscall work in entry-common.S */ -#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT) - /* * Change these and you break ASM code in entry-common.S */ diff --git a/trunk/arch/arm/kernel/entry-common.S b/trunk/arch/arm/kernel/entry-common.S index 520889cf1b5b..b2a27b6b0046 100644 --- a/trunk/arch/arm/kernel/entry-common.S +++ b/trunk/arch/arm/kernel/entry-common.S @@ -87,7 +87,7 @@ ENTRY(ret_from_fork) get_thread_info tsk ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing mov why, #1 - tst r1, #_TIF_SYSCALL_WORK @ are we tracing syscalls? + tst r1, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? beq ret_slow_syscall mov r1, sp mov r0, #1 @ trace exit [IP = 1] @@ -443,7 +443,7 @@ ENTRY(vector_swi) 1: #endif - tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls? + tst r10, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? bne __sys_trace cmp scno, #NR_syscalls @ check upper syscall limit diff --git a/trunk/arch/arm/kernel/ptrace.c b/trunk/arch/arm/kernel/ptrace.c index e1d5e1929fbd..483727ad6892 100644 --- a/trunk/arch/arm/kernel/ptrace.c +++ b/trunk/arch/arm/kernel/ptrace.c @@ -906,6 +906,11 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) { unsigned long ip; + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return scno; + if (!(current->ptrace & PT_PTRACED)) + return scno; + /* * Save IP. IP is used to denote syscall entry/exit: * IP = 0 -> entry, = 1 -> exit @@ -913,17 +918,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) ip = regs->ARM_ip; regs->ARM_ip = why; - if (!ip) - audit_syscall_exit(regs); - else - audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0, - regs->ARM_r1, regs->ARM_r2, regs->ARM_r3); - - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return scno; - if (!(current->ptrace & PT_PTRACED)) - return scno; - current_thread_info()->syscall = scno; /* the 0x80 provides a way for the tracing parent to distinguish diff --git a/trunk/arch/ia64/include/asm/ptrace.h b/trunk/arch/ia64/include/asm/ptrace.h index 68c98f5b3ca6..f5cb27614e35 100644 --- a/trunk/arch/ia64/include/asm/ptrace.h +++ b/trunk/arch/ia64/include/asm/ptrace.h @@ -246,18 +246,7 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -static inline int is_syscall_success(struct pt_regs *regs) -{ - return regs->r10 != -1; -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->r8; - else - return -regs->r8; -} +#define regs_return_value(regs) ((regs)->r8) /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. 
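
[Note: the ia64 hunk above and the per-arch ptrace hunks that follow all revert to the same older audit convention: regs_return_value() goes back to being a plain register accessor, and each architecture's syscall trace hooks test current->audit_context themselves before calling audit_syscall_entry()/audit_syscall_exit(), classifying the result via AUDITSC_RESULT(). A minimal, arch-neutral sketch of that restored convention follows; the example_* names, AUDIT_ARCH_EXAMPLE, and the pt_regs fields (nr, a0..a3, ret) are illustrative only, not taken from any real architecture.]

/*
 * Sketch only, not part of the patch: the calling convention the
 * per-arch hunks revert to. Names and register fields are hypothetical.
 */
static void example_syscall_trace_enter(struct pt_regs *regs)
{
	/* skip all audit work unless this task has an audit context */
	if (unlikely(current->audit_context))
		audit_syscall_entry(AUDIT_ARCH_EXAMPLE, regs->nr,
				    regs->a0, regs->a1, regs->a2, regs->a3);
}

static void example_syscall_trace_leave(struct pt_regs *regs)
{
	/* AUDITSC_RESULT() maps a negative return value to AUDITSC_FAILURE */
	if (unlikely(current->audit_context))
		audit_syscall_exit(AUDITSC_RESULT(regs->ret), regs->ret);
}

[The unlikely() hint keeps the untraced fast path cheap, which is why every architecture in this patch reintroduces the same guard instead of calling into the audit core unconditionally.]
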
diff --git a/trunk/arch/ia64/kernel/ptrace.c b/trunk/arch/ia64/kernel/ptrace.c index dad91661ddf9..8848f43d819e 100644 --- a/trunk/arch/ia64/kernel/ptrace.c +++ b/trunk/arch/ia64/kernel/ptrace.c @@ -1246,8 +1246,15 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); + if (unlikely(current->audit_context)) { + long syscall; + int arch; - audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); + syscall = regs.r15; + arch = AUDIT_ARCH_IA64; + + audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); + } return 0; } @@ -1261,7 +1268,14 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - audit_syscall_exit(®s); + if (unlikely(current->audit_context)) { + int success = AUDITSC_RESULT(regs.r10); + long result = regs.r8; + + if (success != AUDITSC_SUCCESS) + result = -result; + audit_syscall_exit(success, result); + } step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/trunk/arch/microblaze/include/asm/ptrace.h b/trunk/arch/microblaze/include/asm/ptrace.h index 94e92c805859..816bee64b196 100644 --- a/trunk/arch/microblaze/include/asm/ptrace.h +++ b/trunk/arch/microblaze/include/asm/ptrace.h @@ -61,11 +61,6 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->r3; -} - #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/trunk/arch/microblaze/kernel/ptrace.c b/trunk/arch/microblaze/kernel/ptrace.c index 6eb2aa927d89..043cb58f9c44 100644 --- a/trunk/arch/microblaze/kernel/ptrace.c +++ b/trunk/arch/microblaze/kernel/ptrace.c @@ -147,8 +147,10 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, - regs->r7, regs->r8); + if (unlikely(current->audit_context)) + audit_syscall_entry(EM_MICROBLAZE, regs->r12, + regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } @@ -157,7 +159,8 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/trunk/arch/microblaze/kernel/setup.c b/trunk/arch/microblaze/kernel/setup.c index d4fc1a971779..604cd9dd1333 100644 --- a/trunk/arch/microblaze/kernel/setup.c +++ b/trunk/arch/microblaze/kernel/setup.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -227,23 +226,5 @@ static int __init setup_bus_notifier(void) return 0; } -arch_initcall(setup_bus_notifier); - -static DEFINE_PER_CPU(struct cpu, cpu_devices); - -static int __init topology_init(void) -{ - int i, ret; - - for_each_present_cpu(i) { - struct cpu *c = &per_cpu(cpu_devices, i); - ret = register_cpu(c, i); - if (ret) - printk(KERN_WARNING "topology_init: register_cpu %d " - "failed (%d)\n", i, ret); - } - - return 0; -} -subsys_initcall(topology_init); +arch_initcall(setup_bus_notifier); diff --git a/trunk/arch/mips/include/asm/ptrace.h b/trunk/arch/mips/include/asm/ptrace.h index 4b7f5252d2fd..7b99c670e478 100644 --- a/trunk/arch/mips/include/asm/ptrace.h +++ b/trunk/arch/mips/include/asm/ptrace.h @@ -137,19 +137,7 @@ extern int ptrace_set_watch_regs(struct task_struct 
*child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !regs->regs[7]; -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->regs[2]; - else - return -regs->regs[2]; -} - +#define regs_return_value(_regs) ((_regs)->regs[2]) #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/trunk/arch/mips/kernel/ptrace.c b/trunk/arch/mips/kernel/ptrace.c index 7786b608d932..4e6ea1ffad46 100644 --- a/trunk/arch/mips/kernel/ptrace.c +++ b/trunk/arch/mips/kernel/ptrace.c @@ -560,9 +560,10 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* @@ -571,7 +572,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), + -regs->regs[2]); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/trunk/arch/powerpc/include/asm/ptrace.h b/trunk/arch/powerpc/include/asm/ptrace.h index 78a205162fd7..48223f9b8728 100644 --- a/trunk/arch/powerpc/include/asm/ptrace.h +++ b/trunk/arch/powerpc/include/asm/ptrace.h @@ -86,18 +86,7 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !(regs->ccr & 0x10000000); -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - if (is_syscall_success(regs)) - return regs->gpr[3]; - else - return -regs->gpr[3]; -} +#define regs_return_value(regs) ((regs)->gpr[3]) #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/trunk/arch/powerpc/kernel/ptrace.c b/trunk/arch/powerpc/kernel/ptrace.c index 5b43325402bc..5de73dbd15c7 100644 --- a/trunk/arch/powerpc/kernel/ptrace.c +++ b/trunk/arch/powerpc/kernel/ptrace.c @@ -1724,20 +1724,22 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); + if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); + } return ret ?: regs->gpr[0]; } @@ -1746,7 +1748,9 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, + regs->result); if 
(unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/trunk/arch/s390/include/asm/ptrace.h b/trunk/arch/s390/include/asm/ptrace.h index aeb77f017985..56da355678f4 100644 --- a/trunk/arch/s390/include/asm/ptrace.h +++ b/trunk/arch/s390/include/asm/ptrace.h @@ -541,13 +541,9 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) +#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->gprs[2]; -} - int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/trunk/arch/s390/kernel/ptrace.c b/trunk/arch/s390/kernel/ptrace.c index 9d82ed4bcb27..573bc29551ef 100644 --- a/trunk/arch/s390/kernel/ptrace.c +++ b/trunk/arch/s390/kernel/ptrace.c @@ -740,17 +740,20 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - audit_syscall_entry(is_compat_task() ? - AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + if (unlikely(current->audit_context)) + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), + regs->gprs[2]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/trunk/arch/sh/include/asm/ptrace_32.h b/trunk/arch/sh/include/asm/ptrace_32.h index 2d3e906aa722..6c2239cca1a2 100644 --- a/trunk/arch/sh/include/asm/ptrace_32.h +++ b/trunk/arch/sh/include/asm/ptrace_32.h @@ -76,10 +76,7 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->regs[0]; -} +#define regs_return_value(_regs) ((_regs)->regs[0]) #endif /* __KERNEL__ */ diff --git a/trunk/arch/sh/include/asm/ptrace_64.h b/trunk/arch/sh/include/asm/ptrace_64.h index eb3fcceaf64b..bf9be7764d69 100644 --- a/trunk/arch/sh/include/asm/ptrace_64.h +++ b/trunk/arch/sh/include/asm/ptrace_64.h @@ -13,10 +13,7 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->regs[3]; -} +#define regs_return_value(_regs) ((_regs)->regs[3]) #endif /* __KERNEL__ */ diff --git a/trunk/arch/sh/kernel/ptrace_32.c b/trunk/arch/sh/kernel/ptrace_32.c index a3e651563763..92b3c276339a 100644 --- a/trunk/arch/sh/kernel/ptrace_32.c +++ b/trunk/arch/sh/kernel/ptrace_32.c @@ -518,9 +518,10 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[3], + 
regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } @@ -529,7 +530,9 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), + regs->regs[0]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/trunk/arch/sh/kernel/ptrace_64.c b/trunk/arch/sh/kernel/ptrace_64.c index 3d0080b5c976..c8f97649f354 100644 --- a/trunk/arch/sh/kernel/ptrace_64.c +++ b/trunk/arch/sh/kernel/ptrace_64.c @@ -536,9 +536,10 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + if (unlikely(current->audit_context)) + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } @@ -547,7 +548,9 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), + regs->regs[9]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/trunk/arch/sparc/include/asm/ptrace.h b/trunk/arch/sparc/include/asm/ptrace.h index c00c3b5c2806..a0e1bcf843a1 100644 --- a/trunk/arch/sparc/include/asm/ptrace.h +++ b/trunk/arch/sparc/include/asm/ptrace.h @@ -207,15 +207,7 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -static inline int is_syscall_success(struct pt_regs *regs) -{ - return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); -} - -static inline long regs_return_value(struct pt_regs *regs) -{ - return regs->u_regs[UREG_I0]; -} +#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/trunk/arch/sparc/kernel/ptrace_64.c b/trunk/arch/sparc/kernel/ptrace_64.c index 9388844cd88c..96ee50a80661 100644 --- a/trunk/arch/sparc/kernel/ptrace_64.c +++ b/trunk/arch/sparc/kernel/ptrace_64.c @@ -1071,22 +1071,32 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + if (unlikely(current->audit_context) && !ret) + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? 
+ AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - audit_syscall_exit(regs); +#ifdef CONFIG_AUDITSYSCALL + if (unlikely(current->audit_context)) { + unsigned long tstate = regs->tstate; + int result = AUDITSC_SUCCESS; + if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) + result = AUDITSC_FAILURE; + + audit_syscall_exit(result, regs->u_regs[UREG_I0]); + } +#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/trunk/arch/um/kernel/ptrace.c b/trunk/arch/um/kernel/ptrace.c index 06b190390505..c9da32b0c707 100644 --- a/trunk/arch/um/kernel/ptrace.c +++ b/trunk/arch/um/kernel/ptrace.c @@ -167,15 +167,17 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), + UPT_SYSCALL_RET(regs)); + } /* Fake a debug trap */ if (is_singlestep) diff --git a/trunk/arch/x86/ia32/ia32entry.S b/trunk/arch/x86/ia32/ia32entry.S index e3e734005e19..1106261856c8 100644 --- a/trunk/arch/x86/ia32/ia32entry.S +++ b/trunk/arch/x86/ia32/ia32entry.S @@ -14,7 +14,6 @@ #include #include #include -#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -190,7 +189,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -207,13 +206,12 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ + cmpl $0,%eax /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/trunk/arch/x86/kernel/entry_32.S b/trunk/arch/x86/kernel/entry_32.S index 79d97e68f042..4af9fd2450a5 100644 --- a/trunk/arch/x86/kernel/entry_32.S +++ b/trunk/arch/x86/kernel/entry_32.S @@ -42,7 +42,6 @@ */ #include -#include #include #include #include @@ -454,7 +453,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -465,10 +464,11 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..940ba711fc28 100644 --- a/trunk/arch/x86/kernel/entry_64.S +++ b/trunk/arch/x86/kernel/entry_64.S @@ -55,7 +55,6 @@ #include #include #include -#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -549,7 +548,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call __audit_syscall_entry() directly, and then + * We just call audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -559,21 +558,22 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call __audit_syscall_entry + call audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call __audit_syscall_exit() + * Return fast path for syscall audit. Call audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ - setbe %al /* 1 if so, 0 if not */ + cmpq $0,%rsi /* is it < 0? 
*/ + setl %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c index 50267386b766..89a04c7b5bb6 100644 --- a/trunk/arch/x86/kernel/ptrace.c +++ b/trunk/arch/x86/kernel/ptrace.c @@ -1392,18 +1392,20 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (unlikely(current->audit_context)) { + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif + } return ret ?: regs->orig_ax; } @@ -1412,7 +1414,8 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - audit_syscall_exit(regs); + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/trunk/arch/x86/kernel/vm86_32.c b/trunk/arch/x86/kernel/vm86_32.c index af17e1c966dc..863f8753ab0a 100644 --- a/trunk/arch/x86/kernel/vm86_32.c +++ b/trunk/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call __audit_syscall_exit since we do not exit via the normal paths */ + /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - __audit_syscall_exit(1, 0); + audit_syscall_exit(AUDITSC_RESULT(0), 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/trunk/arch/x86/um/shared/sysdep/ptrace.h b/trunk/arch/x86/um/shared/sysdep/ptrace.h index 5ef9344a8b24..711b1621747f 100644 --- a/trunk/arch/x86/um/shared/sysdep/ptrace.h +++ b/trunk/arch/x86/um/shared/sysdep/ptrace.h @@ -3,8 +3,3 @@ #else #include "ptrace_64.h" #endif - -static inline long regs_return_value(struct uml_pt_regs *regs) -{ - return UPT_SYSCALL_RET(regs); -} diff --git a/trunk/arch/xtensa/kernel/ptrace.c b/trunk/arch/xtensa/kernel/ptrace.c index 2dff698ab02e..a0d042aa2967 100644 --- a/trunk/arch/xtensa/kernel/ptrace.c +++ b/trunk/arch/xtensa/kernel/ptrace.c @@ -334,7 +334,8 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + if (unlikely(current->audit_context)) + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c index ee55019066a1..163263ddd381 100644 --- a/trunk/block/cfq-iosched.c +++ b/trunk/block/cfq-iosched.c @@ -3117,17 +3117,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *old_cfqq = cfqd->active_queue; + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); /* * workload type is changed, don't save slice, 
otherwise preempt * doesn't happen */ - if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) + if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) cfqq->cfqg->saved_workload_slice = 0; - cfq_slice_expired(cfqd, 1); - /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. diff --git a/trunk/drivers/usb/host/ehci-xilinx-of.c b/trunk/drivers/usb/host/ehci-xilinx-of.c index 9c2cc4633894..32793ce3d9e9 100644 --- a/trunk/drivers/usb/host/ehci-xilinx-of.c +++ b/trunk/drivers/usb/host/ehci-xilinx-of.c @@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op) } irq = irq_of_parse_and_map(dn, 0); - if (!irq) { + if (irq == NO_IRQ) { printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__); rv = -EBUSY; goto err_irq; diff --git a/trunk/drivers/xen/xen-balloon.c b/trunk/drivers/xen/xen-balloon.c index 596e6a7b17d6..3832e303c33a 100644 --- a/trunk/drivers/xen/xen-balloon.c +++ b/trunk/drivers/xen/xen-balloon.c @@ -221,7 +221,7 @@ static int register_balloon(struct device *dev) { int i, error; - error = subsys_system_register(&balloon_subsys, NULL); + error = bus_register(&balloon_subsys); if (error) return error; diff --git a/trunk/fs/btrfs/Kconfig b/trunk/fs/btrfs/Kconfig index d33f01c08b60..ecb9fd3be143 100644 --- a/trunk/fs/btrfs/Kconfig +++ b/trunk/fs/btrfs/Kconfig @@ -31,22 +31,3 @@ config BTRFS_FS_POSIX_ACL Linux website . If you don't know what Access Control Lists are, say N - -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" - depends on BTRFS_FS - help - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. 
- - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index 0c4fa2befae7..c0ddfd29c5e5 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -8,7 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/trunk/fs/btrfs/backref.c b/trunk/fs/btrfs/backref.c index b9a843226de8..22c64fff1bd5 100644 --- a/trunk/fs/btrfs/backref.c +++ b/trunk/fs/btrfs/backref.c @@ -19,789 +19,18 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" -#include "ulist.h" -#include "transaction.h" -#include "delayed-ref.h" -/* - * this structure records all encountered refs on the way up to the root - */ -struct __prelim_ref { +struct __data_ref { struct list_head list; - u64 root_id; - struct btrfs_key key; - int level; - int count; - u64 parent; - u64 wanted_disk_byte; + u64 inum; + u64 root; + u64 extent_data_item_offset; }; -static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) -{ - struct __prelim_ref *ref; - - /* in case we're adding delayed refs, we're holding the refs spinlock */ - ref = kmalloc(sizeof(*ref), GFP_ATOMIC); - if (!ref) - return -ENOMEM; - - ref->root_id = root_id; - if (key) - ref->key = *key; - else - memset(&ref->key, 0, sizeof(ref->key)); - - ref->level = level; - ref->count = count; - ref->parent = parent; - ref->wanted_disk_byte = wanted_disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) -{ - int ret; - int slot; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; +struct __shared_ref { + struct list_head list; u64 disk_byte; - -add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) - return 0; - - /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. 
- */ - while (1) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - if (ret) - return 0; - - eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; - } - } - - return 0; -} - -/* - * resolve an indirect backref in the form (root_id, key, level) - * to a logical address - */ -static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct __prelim_ref *ref, - struct ulist *parents) -{ - struct btrfs_path *path; - struct btrfs_root *root; - struct btrfs_key root_key; - struct btrfs_key key = {0}; - struct extent_buffer *eb; - int ret = 0; - int root_level; - int level = ref->level; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - root_key.objectid = ref->root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); - - if (root_level + 1 == level) - goto out; - - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); - pr_debug("search slot in root %llu (level %d, ref count %d) returned " - "%d for key (%llu %u %llu)\n", - (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); - if (ret < 0) - goto out; - - eb = path->nodes[level]; - if (!eb) { - WARN_ON(1); - ret = 1; - goto out; - } - - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } - - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); -out: - btrfs_free_path(path); - return ret; -} - -/* - * resolve all indirect backrefs from the list - */ -static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct list_head *head) -{ - int err; - int ret = 0; - struct __prelim_ref *ref; - struct __prelim_ref *ref_safe; - struct __prelim_ref *new_ref; - struct ulist *parents; - struct ulist_node *node; - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - /* - * _safe allows us to insert directly after the current item without - * iterating over the newly inserted items. - * we're also allowed to re-assign ref during iteration. - */ - list_for_each_entry_safe(ref, ref_safe, head, list) { - if (ref->parent) /* already direct */ - continue; - if (ref->count == 0) - continue; - err = __resolve_indirect_ref(fs_info, ref, parents); - if (err) { - if (ret == 0) - ret = err; - continue; - } - - /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); - ref->parent = node ? 
node->val : 0; - - /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { - new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); - if (!new_ref) { - ret = -ENOMEM; - break; - } - memcpy(new_ref, ref, sizeof(*ref)); - new_ref->parent = node->val; - list_add(&new_ref->list, &ref->list); - } - ulist_reinit(parents); - } - - ulist_free(parents); - return ret; -} - -/* - * merge two lists of backrefs and adjust counts accordingly - * - * mode = 1: merge identical keys, if key is set - * mode = 2: merge identical parents - */ -static int __merge_refs(struct list_head *head, int mode) -{ - struct list_head *pos1; - - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; - - ref1 = list_entry(pos1, struct __prelim_ref, list); - - if (mode == 1 && ref1->key.type == 0) - continue; - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; - - ref2 = list_entry(pos2, struct __prelim_ref, list); - - if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) - continue; - ref1->count += ref2->count; - } else { - if (ref1->parent != ref2->parent) - continue; - ref1->count += ref2->count; - } - list_del(&ref2->list); - kfree(ref2); - } - - } - return 0; -} - -/* - * add all currently queued delayed refs from this head whose seq nr is - * smaller or equal that seq to the list - */ -static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, - struct list_head *prefs) -{ - struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; - int sgn; - int ret; - - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); - - while ((n = rb_prev(n))) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - - if (node->seq > seq) - continue; - - switch (node->action) { - case BTRFS_ADD_DELAYED_EXTENT: - case BTRFS_UPDATE_DELAYED_HEAD: - WARN_ON(1); - continue; - case BTRFS_ADD_DELAYED_REF: - sgn = 1; - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - break; - default: - BUG_ON(1); - } - switch (node->type) { - case BTRFS_TREE_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, ref->parent, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, 
ref->root, &key, 0, - ref->parent, node->bytenr, - node->ref_mod * sgn); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return 0; -} - -/* - * add all inline backrefs for bytenr to the list - */ -static int __add_inline_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) -{ - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - unsigned long ptr; - unsigned long end; - struct btrfs_extent_item *ei; - u64 flags; - u64 item_size; - - /* - * enumerate all inline refs - */ - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - - item_size = btrfs_item_size_nr(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; - - info = (struct btrfs_tree_block_info *)ptr; - *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - while (ptr < end) { - struct btrfs_extent_inline_ref *iref; - u64 offset; - int type; - - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - offset = btrfs_extent_inline_ref_offset(leaf, iref); - - switch (type) { - case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - *info_level + 1, offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = (struct btrfs_shared_data_ref *)(iref + 1); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - ptr += btrfs_extent_inline_ref_size(type); - } - - return 0; -} - -/* - * add all non-inline backrefs for bytenr to the list - */ -static int __add_keyed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) -{ - struct btrfs_root *extent_root = fs_info->extent_root; - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - - while (1) { - ret = btrfs_next_item(extent_root, path); - if (ret < 0) - break; - if (ret) { - ret = 0; - break; - } - - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid != bytenr) - break; - if (key.type < BTRFS_TREE_BLOCK_REF_KEY) - continue; - if (key.type > BTRFS_SHARED_DATA_REF_KEY) - break; - - switch (key.type) { - case 
BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - info_level + 1, key.offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = btrfs_item_ptr(leaf, slot, - struct btrfs_shared_data_ref); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_data_ref); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return ret; -} - -/* - * this adds all existing backrefs (inline backrefs, backrefs and delayed - * refs) for the given bytenr to the refs list, merges duplicates and resolves - * indirect refs to their parent bytenr. - * When roots are found, they're added to the roots list - * - * FIXME some caching might speed things up - */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; - struct btrfs_delayed_ref_root *delayed_refs = NULL; - struct btrfs_delayed_ref_head *head = NULL; - int info_level = 0; - int ret; - struct list_head prefs_delayed; - struct list_head prefs; - struct __prelim_ref *ref; - - INIT_LIST_HEAD(&prefs); - INIT_LIST_HEAD(&prefs_delayed); - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * grab both a lock on the path and a lock on the delayed ref head. 
- * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ -again: - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - /* - * look if there are updates for this ref queued and lock the head - */ - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) - goto out; - } - spin_unlock(&delayed_refs->lock); - - if (path->slots[0]) { - struct extent_buffer *leaf; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); - if (ret) - goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, - info_level, &prefs); - if (ret) - goto out; - } - } - btrfs_release_path(path); - - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } - list_splice_init(&prefs_delayed, &prefs); - - ret = __merge_refs(&prefs, 1); - if (ret) - goto out; - - ret = __resolve_indirect_refs(fs_info, &prefs); - if (ret) - goto out; - - ret = __merge_refs(&prefs, 2); - if (ret) - goto out; - - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); - if (ref->count && ref->root_id && ref->parent == 0) { - /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - kfree(ref); - } - -out: - if (head) - mutex_unlock(&head->mutex); - btrfs_free_path(path); - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - kfree(ref); - } - while (!list_empty(&prefs_delayed)) { - ref = list_first_entry(&prefs_delayed, struct __prelim_ref, - list); - list_del(&ref->list); - kfree(ref); - } - - return ret; -} - -/* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. 
- * - * returns 0 on success, <0 on error - */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) -{ - struct ulist *tmp; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); - return -ENOMEM; - } - - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); - ulist_free(tmp); - - if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); - return ret; - } - - return 0; -} - -/* - * walk all backrefs for a given extent to find all roots that reference this - * extent. Walking a backref means finding all extents that reference this - * extent and in turn walk the backrefs of those, too. Naturally this is a - * recursive process, but here it is implemented in an iterative fashion: We - * find all referencing extents for the extent in question and put them on a - * list. In turn, we find all referencing extents for those, further appending - * to the list. The way we iterate the list allows adding more elements after - * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. - * - * returns 0 on success, < 0 on error. - */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) -{ - struct ulist *tmp; - struct ulist_node *node = NULL; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); - return -ENOMEM; - } - - while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); - if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - return ret; - } - node = ulist_next(tmp, node); - if (!node) - break; - bytenr = node->val; - } - - ulist_free(tmp); - return 0; -} - +}; static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, @@ -952,11 +181,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { - pr_debug("logical %llu is not within any extent\n", - (unsigned long long)logical); + found_key->objectid + found_key->offset <= logical) return -ENOENT; - } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -965,13 +191,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); - pr_debug("logical %llu is at position %llu within the extent (%llu " - "EXTENT_ITEM %llu) flags %#llx size %u\n", - (unsigned long long)logical, - (unsigned long long)(logical - found_key->objectid), - (unsigned long long)found_key->objectid, - (unsigned long long)found_key->offset, - (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -1068,11 +287,128 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, - 
iterate_extent_inodes_t *iterate, void *ctx) +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. 
+ */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -1080,10 +416,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; + int ret; + int found = 0; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -1101,99 +435,149 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; - - pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); - if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); - break; + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; } - free_extent_buffer(eb); + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + free_extent_buffer(eb); return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. + * the given parameters. will use the path given as a parameter and return it + * released. * when the iterator function returns a non-zero value, iteration stops. - * path is guaranteed to be in released state when iterate() is called. 
*/ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, u64 extent_item_pos, + u64 extent_item_objectid, + u64 extent_offset, iterate_extent_inodes_t *iterate, void *ctx) { + unsigned long ptr = 0; + int last; int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct btrfs_trans_handle *trans; - struct ulist *refs; - struct ulist *roots; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; - struct seq_list seq_elem; - struct btrfs_delayed_ref_root *delayed_refs; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - pr_debug("resolving all inodes for extent %llu\n", - extent_item_objectid); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); - - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; - if (ret) - goto out; + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); - while (!ret && (ref_node = ulist_next(refs, ref_node))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, - seq_elem.seq, &roots); - if (ret) + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) break; - while (!ret && (root_node = ulist_next(roots, root_node))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, path, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); } } - ulist_free(refs); - ulist_free(roots); -out: - btrfs_put_delayed_seq(delayed_refs, &seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. 
this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. + */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + return ret; } @@ -1202,20 +586,19 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 extent_item_pos; + u64 offset; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - extent_item_pos = logical - found_key.objectid; + offset = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, iterate, ctx); + offset, iterate, ctx); return ret; } @@ -1260,10 +643,6 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ - pr_debug("following ref at offset %u for inode %llu in " - "tree %llu\n", cur, - (unsigned long long)found_key.objectid, - (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -1304,14 +683,10 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { - pr_debug("missed path, not enough space. 
missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/trunk/fs/btrfs/backref.h b/trunk/fs/btrfs/backref.h index d00dfa9ca934..92618837cb8f 100644 --- a/trunk/fs/btrfs/backref.h +++ b/trunk/fs/btrfs/backref.h @@ -20,7 +20,6 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" -#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -55,10 +54,6 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); - struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index 9b9b15fd5204..634608d2a6d0 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -51,9 +51,6 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - /* held while doing delalloc reservations */ - struct mutex delalloc_mutex; - /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/trunk/fs/btrfs/check-integrity.c b/trunk/fs/btrfs/check-integrity.c deleted file mode 100644 index ad0b3ba735b7..000000000000 --- a/trunk/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,3068 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. - * 2b. 
All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include <linux/mutex.h> -#include <linux/crc32c.h> -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "disk-io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask.
- */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num:2; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; - int submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be ref from multiple blocks. - * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block refered from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case that an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). 
- */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; - char name[BDEVNAME_SIZE]; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - int csum_size; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_root *root; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; -}; - -static void btrfsic_block_init(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_alloc(void); -static void btrfsic_block_free(struct btrfsic_block *b); -static void btrfsic_block_link_init(struct btrfsic_block_link *n); -static struct btrfsic_block_link *btrfsic_block_link_alloc(void); -static void btrfsic_block_link_free(struct btrfsic_block_link *n); -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 
dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h); -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices); -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, - int limit_nesting, int force_iodone_flag); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static void btrfsic_dump_database(struct btrfsic_state *state); -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); -static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct 
btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - ds->name[0] = '\0'; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - 
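Every object type removed here (btrfsic_block, btrfsic_block_link, btrfsic_dev_state) follows the same defensive-allocation pattern: the structure is zero-allocated, stamped with a per-type magic number in its init helper, and that magic is asserted again on free, so use-after-free and type-confusion bugs surface immediately. Below is a minimal userspace sketch of that pattern; the example_* names are hypothetical and not part of this patch.

#include <assert.h>
#include <stdlib.h>

/* per-type constant, in the spirit of BTRFSIC_BLOCK_MAGIC_NUMBER */
#define EXAMPLE_MAGIC_NUMBER 0x14491051u

struct example_block {
	unsigned int magic_num;		/* only used for debug purposes */
	unsigned long long dev_bytenr;
};

static struct example_block *example_block_alloc(void)
{
	/* zero-filled allocation, analogous to kzalloc() */
	struct example_block *b = calloc(1, sizeof(*b));

	if (b)
		b->magic_num = EXAMPLE_MAGIC_NUMBER;	/* stamp on init */
	return b;
}

static void example_block_free(struct example_block *b)
{
	/* NULL is tolerated; a wrong or clobbered magic number is a bug */
	assert(b == NULL || b->magic_num == EXAMPLE_MAGIC_NUMBER);
	free(b);
}

int main(void)
{
	struct example_block *b = example_block_alloc();

	example_block_free(b);		/* magic check passes */
	return 0;
}

The deleted kernel code expresses the same invariant with kzalloc(..., GFP_NOFS) and BUG_ON() in place of calloc() and assert().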
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); - - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void 
btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); - - if (ds->bdev == bdev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - int ret; - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int pass; - - BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); - if (NULL == selected_super) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; - } - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - printk(KERN_INFO "btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - state->csum_size = btrfs_super_csum_size(selected_super); - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - struct btrfs_header *hdr; - - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO 
"btrfsic:" - " btrfsic_map_block(root @%llu," - " mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read @logical %llu failed!\n", - (unsigned long long) - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - hdr, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct buffer_head *bh; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); - if (NULL == bh) - return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & 4095)); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { - brelse(bh); - return 0; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - &state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - 
memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num)) { - printk(KERN_INFO "btrfsic: btrfsic_map_block(" - "bytenr @%llu, mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - brelse(bh); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - brelse(bh); - return -1; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - - brelse(bh); - return 0; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (NULL == sf) - printk(KERN_INFO "btrfsic: alloc memory failed!\n"); - else - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - - sf = &initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; 
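/*
 * The frame filled in above takes the place of a C call frame: when a
 * child tree block must be visited, a new btrfsic_stack_frame is
 * allocated, chained via ->prev and entered by jumping back to
 * continue_with_new_stack_frame, so the walk can nest arbitrarily deep
 * without occupying kernel stack space (see the btrfsic_stack_frame
 * declaration above).
 */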
- sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = le64_to_cpu(sf->hdr->generation); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(leafhdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "leaf %llu items %d generation %llu" - " owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - sf->nr, - (unsigned long long) - le64_to_cpu(leafhdr->header.generation), - (unsigned long long) - le64_to_cpu(leafhdr->header.owner)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; - u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); - - type = disk_key->type; - - if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item *) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - le64_to_cpu(root_item-> - generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(nodehdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "node %llu level %d items %d" - " generation %llu owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - (unsigned long long) - le64_to_cpu(nodehdr->header.generation), - (unsigned long long) - le64_to_cpu(nodehdr->header.owner)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &disk_key_ptr->key, - 
le64_to_cpu(disk_key_ptr->generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) - goto one_stack_frame_backwards; - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation) -{ - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, - next_block_ctx, *mirror_nump); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - (unsigned long long)next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block), - (unsigned long long)next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - 
printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block)); - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read block @logical %llu failed!\n", - (unsigned long long)next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - (*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); - struct btrfsic_block_link *l; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," - " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, - (unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), - (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) - return 0; - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; - else - chunk_len = num_bytes; - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct 
btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "btrfsic_handle_extent_data(" - "mirror_num=%d)\n", mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO - "\tdisk_bytenr = %llu, num_bytes %u\n", - (unsigned long long)next_bytenr, - chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block" - " @%llu (%s/%llu/%d)" - " found in hash table, D," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx.dev->name, - (unsigned long long) - next_block_ctx.dev_bytenr, - mirror_num, - (unsigned long long) - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - int ret; - u64 length; - struct btrfs_bio *multi = NULL; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, mirror_num); - - device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - - if (0 == ret) - kfree(multi); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { - printk(KERN_INFO - "btrfsic: read_block() with 
unaligned bytenr %llu\n", - (unsigned long long)block_ctx->dev_bytenr); - return -1; - } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); - return -1; - } - - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); - if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - struct list_head *elem_all; - - BUG_ON(NULL == state); - - printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; - - printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " refers %u* to" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " is ref %u* from" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - printk(KERN_INFO "\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) -{ - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; - - h = (struct btrfs_header *)data; - - if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; - - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; - - return fail || crc_fail; -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct 
block_device *bdev = dev_state->bdev; - - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); - if (NULL != bio_is_patched) - *bio_is_patched = 0; - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - if (block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); - is_metadata = 1; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - printk(KERN_INFO - "[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr, - mapped_data); - } - if (block->logical_bytenr != bytenr) { - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - (unsigned long long) - block->logical_bytenr); - block->logical_bytenr = bytenr; - } else if (state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } else { - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? 
' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), old(gen=%llu," - " objectid=%llu, type=%d, offset=%llu)," - " new(gen=%llu)," - " which is referenced by most recent superblock" - " (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(block->disk_key.objectid), - block->disk_key.type, - (unsigned long long) - le64_to_cpu(block->disk_key.offset), - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), - (unsigned long long) - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," - " which is not yet iodone!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - return; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if it is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", (unsigned long long)bytenr); - return; - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same device state that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = - bio->bi_private; - block->orig_bio_bh_end_io.bio = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.
- bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_data); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - printk(KERN_INFO - "[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, - 0, 0); - } - if (ret) - printk(KERN_INFO - "btrfsic: btrfsic_process_metablock" - "(root @%llu) failed!\n", - (unsigned long long)dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "Written block (%s/%llu/?)" - " !found in hash table, D.\n", - dev_state->name, - (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ - - /* this is getting ugly for the - * include_extent_data case... 
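(editor's note, inferred from the code below: the logical bytenr of a data block cannot be read back from its payload, so a block_ctx is fabricated by hand with bytenr 0 instead of going through btrfsic_map_block())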
*/ - bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; - } else { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/?)" - " !found in hash table, M.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - return; - } - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&block_ctx); - return; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New written %c-block @%llu (%s/%llu/%d)\n", - is_metadata ? 'M' : 'D', - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); - if (ret) - printk(KERN_INFO - "btrfsic: process_metablock(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } -} - -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; - int iodone_w_error; - - /* mutex is not held! 
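(presumably because end_io callbacks can run in interrupt context, where btrfsic_mutex cannot be taken)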
This is not safe if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bio_error_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long) - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp, bio_error_status); -} - -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long)dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: superblock @%llu (%s/%llu/%d)" - " with old gen %llu <= %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: got new superblock @%llu (%s/%llu/%d)" - " with new gen
%llu > %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_process_written_superblock(" - "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { - WARN_ON(1); - btrfsic_dump_tree(state); - } - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - struct list_head *elem_ref_to; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that 
this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations; it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until that block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " %u* refers to %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " with generation %llu !=" - " parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)l->block_ref_to->generation, - (unsigned long long)l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not flushed out of disk's write cache" - " (block flush_gen=%llu," - " dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, -
(unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)block->flush_gen, - (unsigned long long) - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - struct list_head *elem_ref_from; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " is ref %u* from %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Add %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Rem %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && 
- state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - struct list_head *elem_ref_to; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO - "btrfsic: error, kmalloc" " failed!\n"); - return NULL; - } - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, 
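/* editor's note: the flags below only seed a block that is created on first lookup; for a block already present in the hash table they are ignored */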
- int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return NULL; - } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); - if (NULL == dev_state) { - printk(KERN_INFO - "btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New %s%c-block @%llu (%s/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) -{ - int num_copies; - int mirror_num; - int ret; - struct btrfsic_block_data_ctx block_ctx; - int match = 0; - - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) { - printk(KERN_INFO "btrfsic:" - " btrfsic_map_block(logical @%llu," - " mirror %d) failed!\n", - (unsigned long long)bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (!match) { - printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," - " buffer->log_bytenr=%llu, submit_bio(bdev=%s," - " phys_bytenr=%llu)!\n", - (unsigned long long)bytenr, dev_state->name, - (unsigned long long)dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) - continue; - - printk(KERN_INFO "Read logical bytenr @%llu maps to" - " (%s/%llu/%d)\n", - (unsigned long long)bytenr, - block_ctx.dev->name, - (unsigned long long)block_ctx.dev_bytenr, - mirror_num); - } - WARN_ON(1); - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) -{ - struct btrfsic_dev_state *ds; - - ds = btrfsic_dev_state_hashtable_lookup(bdev, - &btrfsic_dev_state_hashtable); - return ds; -} - -int btrfsic_submit_bh(int rw, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(rw, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ 
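/*
 * Editor's sketch, not part of the original patch: the interception
 * pattern that btrfsic_submit_bh() below (and btrfsic_submit_bio()
 * further down) implements, reduced to its essentials:
 *
 *	int checked_submit_bh(int rw, struct buffer_head *bh)
 *	{
 *		if (checker_enabled && (rw & WRITE) && bh->b_size > 0)
 *			check_and_hook_write(bh);	// hypothetical helper:
 *							// validate the payload,
 *							// then redirect b_end_io
 *		return submit_bh(rw, bh);		// always forward
 *	}
 *
 * checker_enabled and check_and_hook_write() are hypothetical stand-ins
 * for btrfsic_is_initialized and btrfsic_process_written_block(); the
 * real code additionally handles pure FLUSH requests by redirecting
 * b_end_io to btrfsic_bh_end_io so completion can bump last_flush_gen.
 */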
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev); - - /* Only called to write the superblock (incl. FLUSH/FUA) */ - if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = 4096 * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," - " size=%lu, data=%p, bdev=%p)\n", - rw, bh->b_blocknr, - (unsigned long long)dev_bytenr, bh->b_size, - bh->b_data, bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "btrfsic_submit_bh(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); -} - -void btrfsic_submit_bio(int rw, struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); - return; - } - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); - if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { - unsigned int i; - u64 dev_bytenr; - int bio_is_patched; - - dev_bytenr = 512 * bio->bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," - " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, bio->bi_sector, - (unsigned long long)dev_bytenr, - bio->bi_bdev); - - for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); - kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; - } - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - 
"btrfsic_submit_bio(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - - submit_bio(rw, bio); -} - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; - } - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->root = root; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->csum_size = 0; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - char *p; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - printk(KERN_INFO - "btrfs check-integrity: kmalloc() failed!\n"); - mutex_unlock(&btrfsic_mutex); - return -1; - } - ds->bdev = device->bdev; - ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - for (p = ds->name; *p != '\0'; p++); - while (p > ds->name && *p != '/') - p--; - if (*p == '/') - p++; - strlcpy(ds->name, p, sizeof(ds->name)); - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(root, fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices) -{ - struct list_head *elem_all; - struct list_head *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - printk(KERN_INFO - "btrfsic: error, cannot find state information" - " on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* 
- * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone) - btrfsic_block_free(b_all); - else - printk(KERN_INFO "btrfs: attempt to free %c-block" - " @%llu (%s/%llu/%d) on umount which is" - " not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kfree(state); -} diff --git a/trunk/fs/btrfs/check-integrity.h b/trunk/fs/btrfs/check-integrity.h deleted file mode 100644 index 8b59175cc502..000000000000 --- a/trunk/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#if !defined(__BTRFS_CHECK_INTEGRITY__) -#define __BTRFS_CHECK_INTEGRITY__ - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -#else -#define btrfsic_submit_bh submit_bh -#define btrfsic_submit_bio submit_bio -#endif - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index 0639a555e16e..dede441bdeee 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0, 1); + buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int 
__btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); + btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); + level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + btrfs_free_tree_block(trans, root, leaf, 0, 1); return 0; } /* diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 27ebe61d3ccc..67385033323d 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -86,9 +86,6 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* orphan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -695,54 +692,6 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* usage filter */ - __le64 usage; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - __le64 unused[8]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { -
/* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -802,32 +751,14 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 struct btrfs_block_group_item { __le64 used; @@ -985,7 +916,6 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; -struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1041,7 +971,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:21; + unsigned long mount_opt:20; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1202,23 +1132,12 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_running; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -1236,10 +1155,6 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* filesystem state */ u64 fs_state; @@ -1468,8 +1383,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 -#define 
BTRFS_BALANCE_ITEM_KEY 248 - /* * string items are for debugging. They just store a short string of * data in the FS @@ -1500,9 +1413,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2167,86 +2077,8 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); -} - /* struct btrfs_super_block */ + BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -2364,7 +2196,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct 
extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } @@ -2445,11 +2277,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); + u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2469,17 +2301,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2491,7 +2323,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2650,18 +2482,10 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); - return 0; -} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc); + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2676,7 +2500,6 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { - kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -2687,24 +2510,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } -/** - * profile_is_valid - tests whether 
a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static inline int profile_is_valid(u64 flags, int extended) -{ - u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - if (extended) - mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & mask) - return 0; - /* true if zero or exactly one bit set */ - return (flags & (~flags + 1)) == flags; -} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/trunk/fs/btrfs/delayed-inode.c b/trunk/fs/btrfs/delayed-inode.c index fe4cd0f1cef1..9c1eccc2c503 100644 --- a/trunk/fs/btrfs/delayed-inode.c +++ b/trunk/fs/btrfs/delayed-inode.c @@ -595,12 +595,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, - num_bytes, 1); + if (!ret) item->bytes_reserved = num_bytes; - } return ret; } @@ -614,9 +610,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -631,7 +624,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -658,13 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) { + if (!ret) node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(root->fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -719,17 +707,11 @@ static int btrfs_delayed_inode_reserve_metadata( * reservation here. I think it may be time for a documentation page on * how block rsvs. work. 
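The profile_is_valid() helper deleted just above leans on a classic bit trick: ~x + 1 is -x in two's complement, x & -x isolates the lowest set bit, and only zero or a power of two compares equal to its own lowest bit. A minimal user-space sketch of that final test, with illustrative values standing in for the BTRFS_BLOCK_GROUP_* bits:

#include <stdint.h>
#include <stdio.h>

/* True iff 'flags' has zero or exactly one bit set: x & (~x + 1)
 * isolates the lowest set bit, which equals x only in those cases. */
static int at_most_one_bit(uint64_t flags)
{
	return (flags & (~flags + 1)) == flags;
}

int main(void)
{
	printf("%d\n", at_most_one_bit(0));                         /* 1: SINGLE */
	printf("%d\n", at_most_one_bit(1ULL << 3));                 /* 1: RAID0  */
	printf("%d\n", at_most_one_bit((1ULL << 3) | (1ULL << 4))); /* 0: mixed  */
	return 0;
}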
*/ - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); + if (!ret) node->bytes_reserved = num_bytes; - } - if (release) { - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); + if (release) btrfs_block_rsv_release(root, src_rsv, num_bytes); - } return ret; } @@ -743,8 +725,6 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1392,6 +1372,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1404,14 +1391,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - - mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c index 66e4f29505a3..125cf76fcd08 100644 --- a/trunk/fs/btrfs/delayed-ref.c +++ b/trunk/fs/btrfs/delayed-ref.c @@ -101,11 +101,6 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; - /* merging of sequenced refs is not allowed */ - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -155,22 +150,16 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. 
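The comp_entry() hunk above drops 'seq' from the comparison chain. The pattern itself is worth spelling out: compare the most significant key first and fall through to the next key only on a tie, so the rbtree ordering stays total. A sketch with illustrative field names, not the kernel structs:

#include <stdint.h>

struct ref {
	uint64_t bytenr;
	int type;
	uint64_t seq;	/* the field this patch removes */
};

static int comp_ref(const struct ref *a, const struct ref *b)
{
	if (a->bytenr != b->bytenr)
		return a->bytenr < b->bytenr ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->seq != b->seq)
		return a->seq < b->seq ? -1 : 1;
	return 0;	/* equal on every key */
}

With the seq step gone, two refs that tie on the earlier keys compare equal again and become candidates for merging, which is exactly what the deleted "merging of sequenced refs is not allowed" comment was guarding against.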
+ * head if it was able to find one, or NULL if nothing was in that spot */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) + struct btrfs_delayed_ref_node **last) { - struct rb_node *n; + struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; - int cmp = 0; + int cmp; -again: - n = root->rb_node; - entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -193,19 +182,6 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } - if (entry && return_bigger) { - if (cmp > 0) { - n = rb_next(&entry->rb_node); - if (!n) - n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; - return_bigger = 0; - goto again; - } - return entry; - } return NULL; } @@ -233,24 +209,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) -{ - struct seq_list *elem; - - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; - - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; - } - return 0; -} - int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -265,8 +223,20 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); + find_ref_head(&delayed_refs->root, start, &ref); if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -420,8 +390,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -468,7 +437,6 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; - ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -500,17 +468,14 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, /* * helper to insert a delayed tree ref into the rbtree. 
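The removed 'return_bigger' mode of find_ref_head() returns the next bigger entry on a miss and wraps around to the smallest one, which is how the caller resumed scanning from an arbitrary bytenr. A user-space model over a sorted array standing in for the rbtree, assuming unique keys:

#include <stddef.h>
#include <stdint.h>

/* Return the first entry >= key, wrapping to v[0] past the end;
 * NULL only when the set is empty. */
static const uint64_t *find_ge_wrap(const uint64_t *v, size_t n,
				    uint64_t key)
{
	size_t lo = 0, hi = n;

	if (n == 0)
		return NULL;
	while (lo < hi) {		/* classic lower_bound */
		size_t mid = lo + (hi - lo) / 2;
		if (v[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo < n ? &v[lo] : &v[0];
}

The restored btrfs_find_ref_cluster() code gets the same "first head at or after start" effect differently: it looks up 'start' and then walks rb_prev() until the bytenr would drop below 'start'.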
*/ -static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) + u64 ref_root, int level, int action) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -526,17 +491,14 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - full_ref = btrfs_delayed_node_to_tree_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) + if (parent) { + full_ref->parent = parent; ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - else + } else { + full_ref->root = ref_root; ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -560,17 +522,15 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) + int action) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -586,18 +546,14 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - full_ref = btrfs_delayed_node_to_data_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) + if (parent) { + full_ref->parent = parent; ref->type = BTRFS_SHARED_DATA_REF_KEY; - else + } else { + full_ref->root = ref_root; ref->type = BTRFS_EXTENT_DATA_REF_KEY; - + } full_ref->objectid = owner; full_ref->offset = offset; @@ -624,12 +580,10 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, * to make sure the delayed ref is eventually processed before this * transaction commits. 
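add_delayed_tree_ref() above goes back to setting exactly one of 'parent' or 'root', with the ref type recording which one is live; the matching union overlay reappears in the delayed-ref.h hunk further down. A sketch of that convention, with illustrative names:

#include <stdint.h>

enum ref_type { TREE_BLOCK_REF, SHARED_BLOCK_REF };

struct tree_ref {
	enum ref_type type;	/* records which union member is live */
	union {
		uint64_t root;		/* TREE_BLOCK_REF   */
		uint64_t parent;	/* SHARED_BLOCK_REF */
	};
};

static void init_tree_ref(struct tree_ref *ref,
			  uint64_t parent, uint64_t ref_root)
{
	if (parent) {
		ref->type = SHARED_BLOCK_REF;
		ref->parent = parent;
	} else {
		ref->type = TREE_BLOCK_REF;
		ref->root = ref_root;
	}
}

The for_cow variant being reverted stored both fields unconditionally, which is why it could not use the union.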
*/ -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -656,17 +610,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - for_cow); + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); BUG_ON(ret); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -674,13 +624,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -707,23 +655,18 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, owner, offset, - action, for_cow); + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); BUG_ON(ret); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -740,13 +683,11 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -763,7 +704,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, 
NULL, 0); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h index d8f244d94925..e287e3b0eab0 100644 --- a/trunk/fs/btrfs/delayed-ref.h +++ b/trunk/fs/btrfs/delayed-ref.h @@ -33,9 +33,6 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; - /* seq number to keep track of insertion order */ - u64 seq; - /* ref count on this data structure */ atomic_t refs; @@ -101,15 +98,19 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; + union { + u64 root; + u64 parent; + }; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; + union { + u64 root; + u64 parent; + }; u64 objectid; u64 offset; }; @@ -139,26 +140,6 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. - */ - wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -170,21 +151,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); @@ -194,60 +170,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); - -struct seq_list { - struct list_head list; - u64 seq; -}; - -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} - -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} - -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root 
*delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); - -/* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c index 7aa9cd36bf1b..d8525662ca7a 100644 --- a/trunk/fs/btrfs/disk-io.c +++ b/trunk/fs/btrfs/disk-io.c @@ -43,7 +43,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" -#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1144,6 +1143,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; + root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1217,14 +1217,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); - if (root) - root->fs_info = fs_info; - return root; -} - static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1232,7 +1224,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info); + root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); @@ -1252,8 +1244,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1327,7 +1318,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = btrfs_alloc_root(fs_info); + root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { @@ -1883,9 +1874,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1897,8 +1888,8 @@ int open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1909,14 +1900,16 @@ 
int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } @@ -2005,17 +1998,6 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2285,7 +2267,9 @@ int open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); @@ -2334,6 +2318,9 @@ int open_ctree(struct super_block *sb, fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { @@ -2366,19 +2353,6 @@ int open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { - ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 
- 1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" - " integrity check module %s\n", sb->s_id); - } -#endif - /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2394,7 +2368,7 @@ int open_ctree(struct super_block *sb, btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = btrfs_alloc_root(fs_info); + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; @@ -2449,17 +2423,13 @@ int open_ctree(struct super_block *sb, if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); - - if (!err) - err = btrfs_recover_balance(fs_info->tree_root); - if (err) { close_ctree(tree_root); - return err; + return ERR_PTR(err); } } - return 0; + return tree_root; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2505,7 +2475,8 @@ int open_ctree(struct super_block *sb, cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - return err; + free_fs_info(fs_info); + return ERR_PTR(err); recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) @@ -2660,7 +2631,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2737,7 +2708,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + submit_bio(WRITE_FLUSH, bio); return 0; } @@ -3001,9 +2972,6 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); - btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ @@ -3011,7 +2979,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_run_defrag_inodes(root->fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3040,8 +3008,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); - kthread_stop(fs_info->cleaner_kthread); + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3059,14 +3027,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(fs_info->chunk_root->node); - free_extent_buffer(fs_info->chunk_root->commit_root); - free_extent_buffer(fs_info->dev_root->node); - free_extent_buffer(fs_info->dev_root->commit_root); - free_extent_buffer(fs_info->csum_root->node); - free_extent_buffer(fs_info->csum_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); - btrfs_free_block_groups(fs_info); + 
btrfs_free_block_groups(root->fs_info); del_fs_roots(fs_info); @@ -3086,17 +3054,14 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) - btrfsic_unmount(root, fs_info->fs_devices); -#endif - btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); + free_fs_info(fs_info); + return 0; } diff --git a/trunk/fs/btrfs/disk-io.h b/trunk/fs/btrfs/disk-io.h index e4bc4741319b..c99d0a8f13fa 100644 --- a/trunk/fs/btrfs/disk-io.h +++ b/trunk/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/trunk/fs/btrfs/export.c b/trunk/fs/btrfs/export.c index 5f77166fd01c..1b8dc33778f9 100644 --- a/trunk/fs/btrfs/export.c +++ b/trunk/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/trunk/fs/btrfs/extent-tree.c b/trunk/fs/btrfs/extent-tree.c index 700879ed64cf..f5fbe576d2ba 100644 --- a/trunk/fs/btrfs/extent-tree.c +++ b/trunk/fs/btrfs/extent-tree.c @@ -618,7 +618,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -1871,24 +1872,20 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2235,28 +2232,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } } - /* - * locked_ref is the head node, so we have to go one - * node back for any 
delayed ref updates - */ - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - locked_ref = NULL; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; - } - /* * record the must insert reserved flag before we * drop the spin lock. @@ -2267,6 +2242,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, @@ -2287,7 +2267,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - goto next; + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; } list_del_init(&locked_ref->cluster); @@ -2297,12 +2279,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. - */ + spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2312,34 +2289,13 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; -next: - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); + cond_resched(); spin_lock(&delayed_refs->lock); } return count; } - -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) -{ - struct list_head *first_seq = delayed_refs->seq_head.next; - - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(delayed_refs->seq_wait, - num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); - spin_lock(&delayed_refs->lock); -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. 
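wait_for_more_refs(), deleted above, is a textbook sleep-until-state-changes loop: record what you saw under the lock, then block until either the entry count or the head of the seq list differs. A rough user-space equivalent, with pthread primitives standing in for the kernel waitqueue and illustrative names:

#include <pthread.h>

struct ref_queue {
	pthread_mutex_t lock;
	pthread_cond_t changed;		/* broadcast on queue updates */
	unsigned long num_entries;
	void *first_seq;
};

static void wait_for_more_refs(struct ref_queue *q,
			       unsigned long seen_entries,
			       void *seen_first)
{
	pthread_mutex_lock(&q->lock);
	while (q->num_entries == seen_entries && q->first_seq == seen_first)
		pthread_cond_wait(&q->changed, &q->lock);
	pthread_mutex_unlock(&q->lock);
}

The kernel version likewise dropped delayed_refs->lock before sleeping and retook it afterwards, mirroring the unlock that pthread_cond_wait() performs internally here.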
count can be @@ -2355,23 +2311,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: - consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2388,35 +2336,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * of refs to process starting at the first one we are able to * lock */ - delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - } else { - wait_for_more_refs(delayed_refs, num_refs); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. - * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2424,11 +2348,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (count == 0) break; - - if (ret || delayed_refs->run_delayed_start == 0) { - /* refs were run, let's reset staleness detection */ - consider_waiting = 0; - } } if (run_all) { @@ -2486,8 +2405,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 
1 : 0; - ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, - num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2672,7 +2590,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -2685,7 +2603,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2722,15 +2640,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - for_cow); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -2742,15 +2659,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3076,7 +2993,9 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3097,27 +3016,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= 
extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } } -/* - * @flags: available profiles in extended format (see ctree.h) - * - * Returns reduced profile in chunk format. If profile changing is in - * progress (either running or paused) picks the target profile (if it's - * already available), otherwise falls back to plain reducing. - */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3128,34 +3040,6 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; - /* pick restriper's target profile if it's available */ - spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { - spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; - } - } - spin_unlock(&root->fs_info->balance_lock); - if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3175,25 +3059,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { + (flags & BTRFS_BLOCK_GROUP_DUP))) flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } - -out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; - + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; return btrfs_reduce_alloc_profile(root, flags); } @@ -3310,8 +3191,6 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3331,8 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3380,15 +3257,27 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } + + /* + * we have two similar checks here, 
one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; + /* 256MB or 5% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) return 0; return 1; } @@ -3402,7 +3291,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); + flags = btrfs_reduce_alloc_profile(extent_root, flags); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -3693,10 +3582,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); ret = 0; } else { /* @@ -3764,10 +3649,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3874,8 +3755,7 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3911,9 +3791,6 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4070,8 +3947,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, - num_bytes); + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); } /* @@ -4130,15 +4006,11 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); sinfo->reservation_progress++; 
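The restored should_alloc_chunk() logic above combines a hard byte floor with a percentage check: keep using existing chunks while they still have either 256MB free or at least 20% headroom. Condensed into a standalone sketch (div_factor(x, 8) in btrfs computes x * 8 / 10, i.e. an 80% utilization bound):

#include <stdint.h>

#define SZ_256M		(256ULL * 1024 * 1024)

static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

static int should_alloc_chunk(uint64_t total_bytes, uint64_t allocated,
			      uint64_t alloc_bytes)
{
	/* more than 256MB would still be free: don't allocate */
	if (allocated + alloc_bytes + SZ_256M < total_bytes)
		return 0;
	/* still under 80% utilized: don't allocate */
	if (allocated + alloc_bytes < div_factor(total_bytes, 8))
		return 0;
	return 1;
}

The full function goes on to compare against max(256MB, 5% of the filesystem); the sketch keeps only the two per-space-info checks that the restored comment describes.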
block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4173,8 +4045,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4191,8 +4062,6 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, - trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4210,8 +4079,6 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. */ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4219,8 +4086,6 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4348,11 +4213,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(root, inode)) flush = 0; + else + WARN_ON(!mutex_is_locked(&inode->i_mutex)); if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4400,14 +4266,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) { + if (to_free) btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4418,11 +4278,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", - btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4452,8 +4308,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4708,10 +4562,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + BUG_ON(space_info->bytes_may_use < num_bytes); space_info->bytes_may_use -= 
num_bytes; } } @@ -5077,8 +4928,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the @@ -5106,17 +4955,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } @@ -5151,12 +4999,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -5168,17 +5016,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); } return ret; @@ -5301,8 +5146,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, data); - space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5452,6 +5295,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (unlikely(block_group->ro)) goto loop; + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5479,8 +5331,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, 
num_bytes); goto checks; } @@ -5499,15 +5349,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. */ - if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5518,11 +5361,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5539,9 +5377,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, - num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5566,15 +5401,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, } unclustered_alloc: - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5612,6 +5438,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, goto loop; } + ins->objectid = search_start; + ins->offset = num_bytes; + if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5628,8 +5457,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(orig_root, block_group, - search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -6015,10 +5842,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -6171,11 +5997,10 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); } /* @@ -6189,7 +6014,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 
empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6205,7 +6030,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } @@ -6233,11 +6058,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); + extent_op); BUG_ON(ret); } return buf; @@ -6254,7 +6078,6 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; - int for_reloc; }; #define DROP_REFERENCE 1 @@ -6393,9 +6216,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6539,7 +6362,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6613,11 +6436,9 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6644,7 +6465,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6728,8 +6549,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. */ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6817,7 +6637,6 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; - wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6902,7 +6721,6 @@ void btrfs_drop_snapshot(struct btrfs_root *root, * drop subtree rooted at tree block 'node'. 
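The walk_control changes strip the for_reloc plumbing from the snapshot-drop walker, but the walk keeps its shape: stop descending while a subtree is still shared, and free on the way back up once the last reference is held. A loose toy model of that shape only — the real walker is iterative, staged (DROP_REFERENCE/UPDATE_BACKREF) and transactional, and struct node/drop_tree here are inventions:

#include <stdlib.h>

struct node {
	int refs;		/* how many trees still point here */
	int nchild;
	struct node *child[8];
};

/* Post-order drop: if another snapshot still references this node we
 * only drop our ref and stop -- everything below stays shared.  If we
 * held the last reference, the children are ours to walk and free. */
static void drop_tree(struct node *n)
{
	if (!n)
		return;
	if (--n->refs > 0)
		return;
	for (int i = 0; i < n->nchild; i++)
		drop_tree(n->child[i]);
	free(n);
}

int main(void)
{
	struct node *leaf = calloc(1, sizeof(*leaf));
	struct node *root = calloc(1, sizeof(*root));

	leaf->refs = 2;		/* shared with another snapshot */
	root->refs = 1;
	root->nchild = 1;
	root->child[0] = leaf;
	drop_tree(root);	/* frees root; leaf survives, refs == 1 */
	free(leaf);
	return 0;
}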
* * NOTE: this function will unlock and release tree block 'node' - * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6947,7 +6765,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; - wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6975,29 +6792,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } - } - /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS @@ -7291,7 +7085,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(NULL, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7653,7 +7447,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); - update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7673,22 +7466,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; -} - int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7699,7 +7476,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; - int index; int factor; root = root->fs_info->extent_root; @@ -7715,7 +7491,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7790,8 +7565,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if 
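With the restriper branch deleted, update_block_group_flags is back to reducing the target profile purely from the usable device count (including missing devices, per the surrounding comment). The reduction rule, sketched with made-up flag bits — the real mask values and the full ordering live in ctree.h:

#include <stdint.h>
#include <stdio.h>

#define BG_RAID0	(1u << 0)	/* illustrative bits only */
#define BG_RAID1	(1u << 1)
#define BG_RAID10	(1u << 2)

static uint32_t reduce_profile(uint32_t flags, unsigned num_devices)
{
	if (num_devices == 1)
		flags &= ~(BG_RAID1 | BG_RAID0);	/* no mirror/stripe */
	if (num_devices < 4)
		flags &= ~BG_RAID10;			/* needs 4 devices */

	/* pick the strongest profile that survived the device check */
	if (flags & BG_RAID10)
		return BG_RAID10;
	if (flags & BG_RAID1)
		return BG_RAID1;
	if (flags & BG_RAID0)
		return BG_RAID0;
	return 0;		/* single */
}

int main(void)
{
	/* RAID10 requested on a 2-device fs degrades to RAID1 */
	printf("%u\n", reduce_profile(BG_RAID10 | BG_RAID1, 2));
	return 0;
}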
(list_empty(&block_group->space_info->block_groups[index])) - clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 9d09a4f81875..49f3c9dc09f4 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -18,7 +18,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" -#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1896,7 +1895,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); + submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2394,7 +2393,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -3580,7 +3579,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/trunk/fs/btrfs/extent_io.h b/trunk/fs/btrfs/extent_io.h index bc6a042cb6fc..7604c3001322 100644 --- a/trunk/fs/btrfs/extent_io.h +++ b/trunk/fs/btrfs/extent_io.h @@ -129,7 +129,6 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; - pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -138,7 +137,6 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; - int lock_nested; /* protects write locks */ rwlock_t lock; diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 859ba2dd8890..034d98503229 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -678,7 +678,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); } if 
(del_nr == 0) { @@ -1274,6 +1274,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); pos += copied; num_written += copied; diff --git a/trunk/fs/btrfs/free-space-cache.c b/trunk/fs/btrfs/free-space-cache.c index d20ff87ca603..9a897bf79538 100644 --- a/trunk/fs/btrfs/free-space-cache.c +++ b/trunk/fs/btrfs/free-space-cache.c @@ -319,11 +319,9 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); - } + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); } } @@ -637,10 +635,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return ret; - + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; @@ -843,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, extent_start, extent_end, len; + u64 start, end, len; int entries = 0; int bitmaps = 0; int ret; @@ -854,9 +849,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -864,12 +857,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); + /* + * When searching for pinned extents, we need to start at our start + * offset. 
+ */ + if (block_group) + start = block_group->key.objectid; + node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -912,20 +918,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ - - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - - if (block_group) - start = block_group->key.objectid; - while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, + ret = find_first_extent_bit(unpin, start, &start, &end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -933,21 +928,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + + if (start >= block_group->key.objectid + block_group->key.offset) break; - extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); - len = extent_end - extent_start; + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); if (ret) goto out_nospc; - start = extent_end; + start = end + 1; } /* Write out the bitmaps */ @@ -2289,23 +2283,23 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long want_bits; - unsigned long min_bits; + unsigned long search_bits; + unsigned long total_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; + bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2314,7 +2308,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= min_bits) { + if (next_zero - i >= search_bits) { found_bits = next_zero - i; break; } @@ -2324,9 +2318,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, if (!found_bits) return -ENOSPC; - if (!total_found) { + if (!found) { start = i; cluster->max_size = 0; + found = true; } total_found += found_bits; @@ -2334,8 +2329,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < want_bits || cluster->max_size < cont1_bytes) { - i = next_zero + 1; + if (total_found < total_bits) { + i = 
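The rewritten pinned-extent loop is interval clipping: each EXTENT_DIRTY range found in the pinned tree is trimmed against the block group before being emitted as a free-space entry. The clip step in isolation, with invented names and an inclusive end as in the extent bit tree:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };	/* end is inclusive */

/* Clip 'ext' against the block group 'bg', starting no earlier than
 * 'cursor'.  Returns the usable length, 0 if nothing overlaps. */
static uint64_t clip_len(struct range bg, struct range ext,
			 uint64_t cursor, uint64_t *out_start)
{
	uint64_t s = ext.start > cursor ? ext.start : cursor;
	uint64_t e = (ext.end < bg.end ? ext.end : bg.end) + 1;

	if (s > bg.end || e <= s)
		return 0;
	*out_start = s;
	return e - s;
}

int main(void)
{
	struct range bg  = { 0, 4095 };		/* 4 KiB block group */
	struct range ext = { 4000, 8191 };	/* pinned range overhangs */
	uint64_t start, len = clip_len(bg, ext, 0, &start);

	printf("entry at %llu, len %llu\n",	/* 4000, 96 */
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}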
find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } goto again; } @@ -2346,31 +2346,28 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, &entry->offset_index, 1); BUG_ON(ret); - trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. - * Try to find a cluster with at least bytes total bytes, at least one - * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 total_size = 0; + u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2380,8 +2377,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap || entry->bytes < min_bytes) { - if (entry->bitmap && list_empty(&entry->list)) + while (entry->bitmap) { + if (list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2394,9 +2391,12 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; + prev = entry; - for (node = rb_next(&entry->offset_index); node; - node = rb_next(&entry->offset_index)) { + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,18 +2405,26 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - if (entry->bytes < min_bytes) - continue; - - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) + /* + * we haven't filled the empty size and the window is + * very large. 
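btrfs_bitmap_cluster, in the form restored here, scans a free-space bitmap for runs of set bits, accumulates qualifying runs until enough total bits are collected, and restarts when the window spreads too wide. A reduced version of that scan over a single 64-bit word — the names and the exact restart rule are approximations of the kernel loop, not copies:

#include <stdint.h>
#include <stdio.h>

#define NBITS 64

static int next_set(uint64_t map, int from)
{
	for (int i = from; i < NBITS; i++)
		if (map >> i & 1)
			return i;
	return NBITS;
}

static int next_zero(uint64_t map, int from)
{
	for (int i = from; i < NBITS; i++)
		if (!(map >> i & 1))
			return i;
	return NBITS;
}

/* Gather runs of at least run_min set bits until total_min bits are
 * collected; restart if the window grows past twice total_min. */
static int scan_cluster(uint64_t map, int run_min, int total_min,
			int *start_out)
{
	int total = 0, start = -1;
	int i = next_set(map, 0);

	while (i < NBITS) {
		int z = next_zero(map, i);

		if (z - i >= run_min) {
			if (start < 0)
				start = i;
			total += z - i;
			if (total >= total_min) {
				*start_out = start;
				return 0;
			}
		}
		i = next_set(map, z);
		if (start >= 0 && i - start > 2 * total_min) {
			total = 0;	/* too sparse: restart the window */
			start = -1;
		}
	}
	return -1;		/* ENOSPC */
}

int main(void)
{
	int start;

	/* two 4-bit runs close together: bits 4-7 and 12-15 */
	if (scan_cluster(0xF0F0u, 4, 8, &start) == 0)
		printf("cluster starts at bit %d\n", start);	/* 4 */
	return 0;
}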
reset and try again + */ + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; max_extent = entry->bytes; + } else { + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + prev = entry; } - if (window_free < bytes || max_extent < cont1_bytes) - return -ENOSPC; - cluster->window_start = first->offset; node = &first->offset_index; @@ -2430,18 +2438,17 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap || entry->bytes < min_bytes) + if (entry->bitmap) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); - total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; } @@ -2453,7 +2460,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2478,7 +2485,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, cont1_bytes, min_bytes); + bytes, min_bytes); if (!ret) return 0; } @@ -2492,7 +2499,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes+empty_size. + * is to find at least bytes free and up to empty_size + bytes free. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2508,24 +2515,23 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; - u64 cont1_bytes; int ret; - /* - * Choose the minimum extent size we'll require for this - * cluster. For SSD_SPREAD, don't allow any fragmentation. - * For metadata, allow allocates with smaller extents. For - * data, keep it dense. - */ + /* for metadata, allow allocates with more holes */ if (btrfs_test_opt(root, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - cont1_bytes = bytes; - min_bytes = block_group->sectorsize; - } else { - cont1_bytes = max(bytes, (bytes + empty_size) >> 2); - min_bytes = block_group->sectorsize; - } + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. 
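setup_cluster_no_bitmap is likewise back to a sliding window over the sorted free extents: the window restarts whenever the gap to the previous extent exceeds max_gap or the window outgrows twice min_bytes. The same scan over a plain array (window_scan is an invented name; the kernel walks an rbtree instead):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset, bytes; };

/* Return the index of the first extent in a window whose free bytes
 * exceed min_bytes, or -1 if no window qualifies. */
static int window_scan(const struct extent *e, int n,
		       uint64_t min_bytes, uint64_t max_gap)
{
	if (n == 0)
		return -1;

	int first = 0;
	uint64_t window_start = e[0].offset;
	uint64_t window_free = e[0].bytes;

	for (int i = 1; window_free <= min_bytes; i++) {
		if (i == n)
			return -1;	/* ran out of extents: ENOSPC */

		uint64_t gap = e[i].offset -
			       (e[i - 1].offset + e[i - 1].bytes);

		if (gap > max_gap ||
		    e[i].offset - window_start > 2 * min_bytes) {
			first = i;	/* too spread out: restart here */
			window_start = e[i].offset;
			window_free = e[i].bytes;
		} else {
			window_free += e[i].bytes;
		}
	}
	return first;
}

int main(void)
{
	struct extent e[] = {
		{ 0,      4096  },
		{ 524288, 65536 },	/* huge gap: window restarts here */
		{ 593920, 65536 },
	};

	printf("window starts at extent %d\n",
	       window_scan(e, 3, 100000, 128 * 1024));	/* -> 1 */
	return 0;
}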
+ */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); spin_lock(&ctl->tree_lock); @@ -2533,7 +2539,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < bytes) { + if (ctl->free_space < min_bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2546,17 +2552,11 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, - min_bytes); - - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes + empty_size, - cont1_bytes, min_bytes); + bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes + empty_size, - cont1_bytes, min_bytes); + offset, bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) @@ -2567,8 +2567,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; - } else { - trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); @@ -2590,57 +2588,17 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -static int do_trimming(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) -{ - struct btrfs_space_info *space_info = block_group->space_info; - struct btrfs_fs_info *fs_info = block_group->fs_info; - int ret; - int update = 0; - u64 trimmed = 0; - - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += reserved_bytes; - space_info->bytes_reserved += reserved_bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); - if (!ret) - *total_trimmed += trimmed; - - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); - - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; - block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } - - return ret; -} - -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - struct rb_node *node; + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; int ret = 0; - u64 extent_start; - u64 extent_bytes; - u64 bytes; + + *trimmed = 0; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2651,118 +2609,81 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { + if (!entry) + entry = 
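The cluster sizing heuristic restored above derives min_bytes from the allocation class: an exact fit for SSD_SPREAD, a fraction of the request for metadata (a larger fraction while delayed refs are flushing), and a quarter for data. Restated as a plain function with abbreviated names, assuming nothing beyond what the hunk shows:

#include <stdint.h>

#define MAX64(a, b) ((a) > (b) ? (a) : (b))

enum alloc_class { SSD_SPREAD, METADATA, DATA };

static uint64_t cluster_min_bytes(enum alloc_class c, uint64_t bytes,
				  uint64_t empty_size, int refs_flushing)
{
	switch (c) {
	case SSD_SPREAD:	/* no fragmentation tolerated at all */
		return bytes + empty_size;
	case METADATA:		/* tolerate holes; ask for more mid-flush */
		if (refs_flushing)
			return MAX64(bytes, (bytes + empty_size) >> 1);
		return MAX64(bytes, (bytes + empty_size) >> 4);
	default:		/* DATA: keep allocations fairly dense */
		return MAX64(bytes, (bytes + empty_size) >> 2);
	}
}

int main(void)
{
	/* metadata, not flushing: 64K request, 64K slack -> 64K minimum */
	return cluster_min_bytes(METADATA, 65536, 65536, 0) == 65536 ? 0 : 1;
}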
tree_search_offset(ctl, + offset_to_bitmap(ctl, start), + 1, 1); + + if (!entry || entry->offset >= end) { spin_unlock(&ctl->tree_lock); break; } - /* skip bitmaps */ - while (entry->bitmap) { - node = rb_next(&entry->offset_index); - if (!node) { + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; spin_unlock(&ctl->tree_lock); - goto out; + ret = 0; + continue; } - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - } - - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - - extent_start = entry->offset; - extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; + } else { + start = entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); } - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); - spin_unlock(&ctl->tree_lock); - ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); - if (ret) - break; -next: - start += bytes; - - if (fatal_signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - cond_resched(); - } -out: - return ret; -} - -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - int ret = 0; - int ret2; - u64 bytes; - u64 offset = offset_to_bitmap(ctl, start); - - while (offset < end) { - bool next_bitmap = false; - - spin_lock(&ctl->tree_lock); - - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - break; - } - - entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); - if (ret2 || start >= end) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = min(bytes, end - start); - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; - } - - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - - spin_unlock(&ctl->tree_lock); + if (bytes >= minlen) { + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, start, bytes); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } - ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); - if (ret) - break; 
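However the loop is organized, trimming follows one protocol: pull the range out of the free-space tree, fence it off as reserved so the allocator cannot hand it out while the discard is in flight, discard, then put it back. The protocol alone, with plain counters standing in for the space_info/block_group pair and a stub in place of btrfs_error_discard_extent:

#include <stdint.h>
#include <stdio.h>

struct group {
	uint64_t free;		/* bytes in the free-space tree */
	uint64_t reserved;	/* bytes fenced off from the allocator */
	int ro;			/* read-only groups skip the accounting */
};

static void issue_discard(uint64_t start, uint64_t bytes)
{
	printf("discard [%llu, +%llu)\n",
	       (unsigned long long)start, (unsigned long long)bytes);
}

static void trim_range(struct group *g, uint64_t start, uint64_t bytes)
{
	g->free -= bytes;		/* unlink from the tree */
	if (!g->ro)
		g->reserved += bytes;	/* fence it off */

	issue_discard(start, bytes);

	g->free += bytes;		/* re-add the free space */
	if (!g->ro)
		g->reserved -= bytes;
}

int main(void)
{
	struct group g = { 1 << 20, 0, 0 };

	trim_range(&g, 4096, 8192);
	printf("free %llu reserved %llu\n",	/* back to 1 MiB, 0 */
	       (unsigned long long)g.free, (unsigned long long)g.reserved);
	return 0;
}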
-next: - if (next_bitmap) { - offset += BITS_PER_BITMAP * ctl->unit; - } else { - start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; + if (ret) + break; + *trimmed += actually_trimmed; } + start += bytes; + bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2775,22 +2696,6 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) -{ - int ret; - - *trimmed = 0; - - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - return ret; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); - - return ret; -} - /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. diff --git a/trunk/fs/btrfs/inode-map.c b/trunk/fs/btrfs/inode-map.c index 213ffa86ce1b..f8962a957d65 100644 --- a/trunk/fs/btrfs/inode-map.c +++ b/trunk/fs/btrfs/inode-map.c @@ -438,8 +438,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, - trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -500,8 +498,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, - trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 0da19a0ea00d..81b235a61f8c 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -1951,28 +1951,12 @@ enum btrfs_orphan_cleanup_state { void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; int ret; if (!list_empty(&root->orphan_list) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; - spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, @@ -1981,9 +1965,10 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_item_inserted = 0; } - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(root, block_rsv); + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; } } @@ -2239,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + /* + * Need to hold the imutex for reservation purposes, not + * a huge deal here but I have a WARN_ON in + * btrfs_delalloc_reserve_space to catch offenders. 
+ */ + mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); + mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2853,7 +2845,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3017,6 +3009,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; + int encoding; int ret; int err = 0; u64 ino = btrfs_ino(inode); @@ -3066,6 +3059,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); + encoding = 0; if (found_key.objectid != ino) break; @@ -3078,6 +3072,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); @@ -3105,7 +3103,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { + if (!del_item && !encoding) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = new_size - @@ -3181,7 +3179,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); } @@ -3436,7 +3434,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); } else { /* @@ -4657,7 +4655,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); @@ -4725,7 +4723,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4784,7 +4782,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -4850,7 +4848,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -5123,7 +5121,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, } flush_dcache_page(page); } else 
if (create && PageUptodate(page)) { - BUG(); + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -6404,7 +6402,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + /* Need this to keep space reservations serialized */ + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (!ret) ret = btrfs_update_time(vma->vm_file); if (ret) { @@ -6493,8 +6494,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (!ret) return VM_FAULT_LOCKED; unlock_page(page); -out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out: return ret; } @@ -6667,7 +6668,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); } @@ -6748,7 +6749,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -7246,7 +7246,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (!err) d_instantiate(dentry, inode); nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); diff --git a/trunk/fs/btrfs/ioctl.c b/trunk/fs/btrfs/ioctl.c index ab620014bcc3..5441ff1480fd 100644 --- a/trunk/fs/btrfs/ioctl.c +++ b/trunk/fs/btrfs/ioctl.c @@ -176,8 +176,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; - u64 ip_oldflags; - unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -194,9 +192,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); - ip_oldflags = ip->flags; - i_oldflags = inode->i_flags; - flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -254,24 +249,19 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_drop; - } + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); btrfs_end_transaction(trans, root); - out_drop: - if (ret) { - ip->flags = ip_oldflags; - inode->i_flags = i_oldflags; - } mnt_drop_write_file(file); + + ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; @@ -286,13 +276,14 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = 
btrfs_sb(fdentry(file)->d_sb); + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -321,7 +312,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(fs_info->tree_root, &range); + ret = btrfs_trim_fs(root, &range); if (ret < 0) return ret; @@ -367,7 +358,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -867,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode, return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); + mutex_unlock(&inode->i_mutex); if (ret) return ret; again: @@ -1210,21 +1203,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1241,7 +1226,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_free; + goto out_unlock; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1256,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_free; + goto out_unlock; } } @@ -1265,7 +1250,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_free; + goto out_unlock; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1274,11 +1259,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_free; + goto out_unlock; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_free; + goto out_unlock; } do_div(new_size, root->sectorsize); @@ -1291,7 +1276,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_free; + goto out_unlock; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1299,10 +1284,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_free: - kfree(vol_args); -out: +out_unlock: mutex_unlock(&root->fs_info->volume_mutex); + kfree(vol_args); return 
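For reference, the fitrim handler above is driven from user space through the generic VFS ioctl; FITRIM and struct fstrim_range come from <linux/fs.h>, and len is rewritten on return with the number of bytes actually trimmed. A small caller (needs CAP_SYS_ADMIN and a mounted filesystem):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = (unsigned long long)-1;	/* whole filesystem */
	range.minlen = 0;	/* the kernel raises this to the device's
				 * discard granularity, as the hunk shows */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}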
ret; } @@ -2068,25 +2052,14 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2101,25 +2074,14 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2465,8 +2427,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - 0); + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -3016,7 +2977,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_item_pos; + u64 extent_offset; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3047,17 +3008,15 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_item_pos = loi->logical - key.objectid; + extent_offset = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_item_pos, build_ino_list, - inodes); + extent_offset, build_ino_list, inodes); if (ret < 0) goto out; @@ -3075,163 +3034,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - bargs->flags = bctl->flags; - - if (atomic_read(&fs_info->balance_running)) - bargs->state |= BTRFS_BALANCE_STATE_RUNNING; - if (atomic_read(&fs_info->balance_pause_req)) - bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; - if (atomic_read(&fs_info->balance_cancel_req)) - bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; - - memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); - memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); - memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); - - if (lock) { - spin_lock(&fs_info->balance_lock); - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - spin_unlock(&fs_info->balance_lock); - } else { - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - } -} - -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - 
struct btrfs_ioctl_balance_args *bargs; - struct btrfs_balance_control *bctl; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); - goto out; - } - - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } - - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - - goto do_balance; - } - } else { - bargs = NULL; - } - - if (fs_info->balance_ctl) { - ret = -EINPROGRESS; - goto out_bargs; - } - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out_bargs; - } - - bctl->fs_info = fs_info; - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; - } - -do_balance: - ret = btrfs_balance(bctl, bargs); - /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount - */ - if (arg) { - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - } - -out_bargs: - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case BTRFS_BALANCE_CTL_PAUSE: - return btrfs_pause_balance(root->fs_info); - case BTRFS_BALANCE_CTL_CANCEL: - return btrfs_cancel_balance(root->fs_info); - } - - return -EINVAL; -} - -static long btrfs_ioctl_balance_progress(struct btrfs_root *root, - void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ioctl_balance_args *bargs; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out; - } - - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); - if (!bargs) { - ret = -ENOMEM; - goto out; - } - - update_ioctl_balance_args(fs_info, 1, bargs); - - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3276,7 +3078,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); + return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3308,12 +3110,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); - case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); - case BTRFS_IOC_BALANCE_CTL: - return btrfs_ioctl_balance_ctl(root, arg); - case BTRFS_IOC_BALANCE_PROGRESS: - return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/trunk/fs/btrfs/ioctl.h b/trunk/fs/btrfs/ioctl.h index 
4f69028a68c4..252ae9915de8 100644 --- a/trunk/fs/btrfs/ioctl.h +++ b/trunk/fs/btrfs/ioctl.h @@ -109,55 +109,6 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE 1 -#define BTRFS_BALANCE_CTL_CANCEL 2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { - __u64 profiles; - __u64 usage; - __u64 devid; - __u64 pstart; - __u64 pend; - __u64 vstart; - __u64 vend; - - __u64 target; - - __u64 flags; - - __u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { - __u64 expected; /* estimated # of chunks that will be - * relocated to fulfill the request */ - __u64 considered; /* # of chunks we have considered so far */ - __u64 completed; /* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) - -struct btrfs_ioctl_balance_args { - __u64 flags; /* in/out */ - __u64 state; /* out */ - - struct btrfs_balance_args data; /* in/out */ - struct btrfs_balance_args meta; /* in/out */ - struct btrfs_balance_args sys; /* in/out */ - - struct btrfs_balance_progress stat; /* out */ - - __u64 unused[72]; /* pad to 1k */ -}; - #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -321,11 +272,6 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ - struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/trunk/fs/btrfs/locking.c b/trunk/fs/btrfs/locking.c index 5e178d8f7167..d77b67c4b275 100644 --- a/trunk/fs/btrfs/locking.c +++ b/trunk/fs/btrfs/locking.c @@ -33,14 +33,6 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -65,14 +57,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (&eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -97,25 +81,12 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. 
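The ioctl structs deleted above are userspace ABI: packed so they match their disk counterpart byte for byte and padded out to a fixed size. The usual defensive pattern for such a struct, shown on a stand-in layout (demo_args mirrors only the field count of the deleted struct; _Static_assert needs C11):

#include <stdint.h>

struct demo_args {
	uint64_t profiles, usage, devid;
	uint64_t pstart, pend, vstart, vend;
	uint64_t target, flags;
	uint64_t unused[8];	/* reserved for future extension */
} __attribute__((__packed__));

/* Pin the ABI size at compile time so no one can grow it by accident. */
_Static_assert(sizeof(struct demo_args) == 17 * 8,
	       "demo_args must stay exactly 136 bytes");

int main(void)
{
	return 0;
}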
We allow - * an additional read lock to be added because it's for the same - * thread. btrfs_find_all_roots() depends on this as it may be - * called on a partly (write-)locked tree. - */ - BUG_ON(eb->lock_nested); - eb->lock_nested = 1; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -158,7 +129,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); - eb->lock_owner = current->pid; return 1; } @@ -167,15 +137,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -188,15 +149,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -229,7 +181,6 @@ int btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); - eb->lock_owner = current->pid; return 0; } diff --git a/trunk/fs/btrfs/relocation.c b/trunk/fs/btrfs/relocation.c index 8c1aae2c845d..cfb55434a469 100644 --- a/trunk/fs/btrfs/relocation.c +++ b/trunk/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); BUG_ON(ret); ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); BUG_ON(ret); } if (dirty) @@ -1778,23 +1778,21 @@ int replace_path(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2246,7 +2244,7 @@ int merge_reloc_roots(struct reloc_control *rc) } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); } if (found) { @@ -2560,7 +2558,7 @@ static int 
do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -2949,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (ret) goto out; diff --git a/trunk/fs/btrfs/scrub.c b/trunk/fs/btrfs/scrub.c index 9770cc5bfb76..ddf2c90d3fc0 100644 --- a/trunk/fs/btrfs/scrub.c +++ b/trunk/fs/btrfs/scrub.c @@ -25,7 +25,6 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" -#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -310,7 +309,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_item_pos; + u64 extent_offset; path = btrfs_alloc_path(); @@ -330,13 +329,12 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; + extent_offset = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -353,7 +351,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, + extent_offset, scrub_print_warning_inode, &swarn); } @@ -734,7 +732,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -960,7 +958,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - btrfsic_submit_bio(READ, sbio->bio); + submit_bio(READ, sbio->bio); return 0; } diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index 3ce97b217cbe..ae488aa1966a 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? 
*/ } enum { @@ -163,11 +163,8 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, - Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -202,10 +199,6 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -404,40 +397,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); - break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", - info->check_integrity_print_mask); - } - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); - ret = -EINVAL; - goto out; -#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -541,8 +500,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = sb->s_fs_info; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -572,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. 
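
[The tokens table and switch above are the usual match_table_t pattern: the mount-option string is split on commas and each token is mapped to an Opt_* constant before the switch applies it. A minimal userspace sketch of that pattern follows; opt_lookup, the token list, and the demo main are illustrative only, not the kernel API.]

#include <stdio.h>
#include <string.h>

enum { OPT_DEGRADED, OPT_DISCARD, OPT_RECOVERY, OPT_ERR };

static const struct { const char *name; int token; } demo_tokens[] = {
	{ "degraded", OPT_DEGRADED },
	{ "discard",  OPT_DISCARD },
	{ "recovery", OPT_RECOVERY },
	{ NULL,       OPT_ERR },
};

static int opt_lookup(const char *p)
{
	int i;

	/* linear scan, exactly like match_token() over a match_table_t */
	for (i = 0; demo_tokens[i].name; i++)
		if (strcmp(p, demo_tokens[i].name) == 0)
			return demo_tokens[i].token;
	return OPT_ERR;
}

int main(void)
{
	char opts[] = "degraded,discard,bogus";
	char *p = strtok(opts, ",");

	while (p) {
		printf("%-10s -> token %d\n", p, opt_lookup(p));
		p = strtok(NULL, ",");
	}
	return 0;
}
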
*/ - dir_id = btrfs_super_root_dir(fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -586,7 +544,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; + new_root = root->fs_info->fs_root; goto setup_root; } @@ -594,7 +552,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -630,7 +588,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; struct btrfs_key key; int err; @@ -645,16 +603,18 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - err = open_ctree(sb, fs_devices, (char *)data); - if (err) { + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { printk("btrfs: open_ctree failed\n"); - return err; + return PTR_ERR(tree_root); } + sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -671,25 +631,23 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; return 0; fail_close: - close_ctree(fs_info->tree_root); + close_ctree(tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = btrfs_sb(sb); int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(fs_info->btree_inode->i_mapping); + filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; } @@ -705,8 +663,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = info->tree_root; + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_fs_info *info = root->fs_info; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -764,25 +722,28 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) - seq_puts(seq, ",skip_balance"); return 0; } static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); + struct btrfs_root *test_root = data; + struct btrfs_root *root = btrfs_sb(s); - return fs_info->fs_devices == p->fs_devices; + /* + * If this super block is going away, return false as it + * can't match as an existing super block. 
+ */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; + s->s_fs_info = data; + + return set_anon_super(s, data); } /* @@ -942,6 +903,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { + error = -ENOMEM; + goto error_fs_info; + } + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); @@ -961,30 +928,43 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; } if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_close_devices; + } + btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) - error = -EBUSY; } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->bdev_holder = fs_type; + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + + s->s_flags |= MS_ACTIVE; } - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { deactivate_locked_super(s); + return root; + } return root; @@ -997,8 +977,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = btrfs_sb(sb); int ret; ret = btrfs_parse_options(root, data); @@ -1014,13 +993,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (fs_info->fs_devices->rw_devices == 0) + if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(fs_info); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); /* recover relocation */ @@ -1189,18 +1168,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 
*)root->fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&fs_info->chunk_mutex); + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1219,14 +1198,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); + ret = btrfs_calc_avail_data_space(root, &total_free_data); if (ret) { - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1240,18 +1219,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static void btrfs_kill_super(struct super_block *sb) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - kill_anon_super(sb); - free_fs_info(fs_info); -} - static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, + .kill_sb = kill_anon_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1285,17 +1257,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_lock(&fs_info->transaction_kthread_mutex); - mutex_lock(&fs_info->cleaner_mutex); + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_unlock(&fs_info->cleaner_mutex); - mutex_unlock(&fs_info->transaction_kthread_mutex); + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; } diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 287a6728b1ad..81376d94cd3c 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -36,8 +36,6 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -110,11 +108,8 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); @@ -326,8 +321,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root 
*root, } if (num_bytes) { - trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)h, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } @@ -474,12 +467,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 2) { + while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; @@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0, 0); + btrfs_drop_snapshot(root, NULL, 0); else - btrfs_drop_snapshot(root, NULL, 1, 0); + btrfs_drop_snapshot(root, NULL, 1); } return 0; } diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index cb877e0886a7..3568374d419d 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); BUG_ON(ret); } else { /* diff --git a/trunk/fs/btrfs/ulist.c b/trunk/fs/btrfs/ulist.c deleted file mode 100644 index 12f5147bd2b1..000000000000 --- a/trunk/fs/btrfs/ulist.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen - * Distributed under the GNU GPL license version 2. - */ - -#include -#include -#include "ulist.h" - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports is adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * - * A sample usage for ulists is the enumeration of directed graphs without - * visiting a node twice. The pseudo-code could look like this: - * - * ulist = ulist_alloc(); - * ulist_add(ulist, root); - * elem = NULL; - * - * while ((elem = ulist_next(ulist, elem)) { - * for (all child nodes n in elem) - * ulist_add(ulist, n); - * do something useful with the node; - * } - * ulist_free(ulist); - * - * This assumes the graph nodes are adressable by u64. This stems from the - * usage for tree enumeration in btrfs, where the logical addresses are - * 64 bit. - * - * It is also useful for tree enumeration which could be done elegantly - * recursively, but is not possible due to kernel stack limitations. The - * loop would be similar to the above. - */ - -/** - * ulist_init - freshly initialize a ulist - * @ulist: the ulist to initialize - * - * Note: don't use this function to init an already used ulist, use - * ulist_reinit instead. 
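
[The overview comment above documents the ulist traversal idiom; note its pseudo-code drops a closing parenthesis in the while condition. Below is a self-contained userspace model of the same add-while-iterating pattern, using a fixed capacity instead of the krealloc growth; the mini_ulist names are illustrative.]

#include <stdint.h>
#include <stdio.h>

#define CAP 64

struct mini_ulist {
	uint64_t vals[CAP];
	int nnodes;
};

/* returns 1 if inserted, 0 if val was already present */
static int mini_ulist_add(struct mini_ulist *u, uint64_t val)
{
	int i;

	for (i = 0; i < u->nnodes; i++)
		if (u->vals[i] == val)
			return 0;
	u->vals[u->nnodes++] = val;
	return 1;
}

/* prev == NULL starts the iteration; items added mid-walk show up */
static uint64_t *mini_ulist_next(struct mini_ulist *u, uint64_t *prev)
{
	int next = prev ? (int)(prev - u->vals) + 1 : 0;

	return next < u->nnodes ? &u->vals[next] : NULL;
}

int main(void)
{
	struct mini_ulist seen = { .nnodes = 0 };
	uint64_t *elem = NULL;

	mini_ulist_add(&seen, 100);	/* the "root" node */
	while ((elem = mini_ulist_next(&seen, elem)) != NULL) {
		printf("visit %llu\n", (unsigned long long)*elem);
		/* children of a toy graph: n -> n + 1 while n < 103 */
		if (*elem < 103)
			mini_ulist_add(&seen, *elem + 1);
	}
	return 0;
}
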
- */ -void ulist_init(struct ulist *ulist) -{ - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; -} -EXPORT_SYMBOL(ulist_init); - -/** - * ulist_fini - free up additionally allocated memory for the ulist - * @ulist: the ulist from which to free the additional memory - * - * This is useful in cases where the base 'struct ulist' has been statically - * allocated. - */ -void ulist_fini(struct ulist *ulist) -{ - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ -} -EXPORT_SYMBOL(ulist_fini); - -/** - * ulist_reinit - prepare a ulist for reuse - * @ulist: ulist to be reused - * - * Free up all additional memory allocated for the list elements and reinit - * the ulist. - */ -void ulist_reinit(struct ulist *ulist) -{ - ulist_fini(ulist); - ulist_init(ulist); -} -EXPORT_SYMBOL(ulist_reinit); - -/** - * ulist_alloc - dynamically allocate a ulist - * @gfp_mask: allocation flags to for base allocation - * - * The allocated ulist will be returned in an initialized state. - */ -struct ulist *ulist_alloc(unsigned long gfp_mask) -{ - struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); - - if (!ulist) - return NULL; - - ulist_init(ulist); - - return ulist; -} -EXPORT_SYMBOL(ulist_alloc); - -/** - * ulist_free - free dynamically allocated ulist - * @ulist: ulist to free - * - * It is not necessary to call ulist_fini before. - */ -void ulist_free(struct ulist *ulist) -{ - if (!ulist) - return; - ulist_fini(ulist); - kfree(ulist); -} -EXPORT_SYMBOL(ulist_free); - -/** - * ulist_add - add an element to the ulist - * @ulist: ulist to add the element to - * @val: value to add to ulist - * @aux: auxiliary value to store along with val - * @gfp_mask: flags to use for allocation - * - * Note: locking must be provided by the caller. In case of rwlocks write - * locking is needed - * - * Add an element to a ulist. The @val will only be added if it doesn't - * already exist. If it is added, the auxiliary value @aux is stored along with - * it. In case @val already exists in the ulist, @aux is ignored, even if - * it differs from the already stored value. - * - * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been - * inserted. - * In case of allocation failure -ENOMEM is returned and the ulist stays - * unaltered. 
- */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) -{ - int i; - - for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) - return 0; - } - - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; - - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ++ulist->nnodes; - - return 1; -} -EXPORT_SYMBOL(ulist_add); - -/** - * ulist_next - iterate ulist - * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration - * - * Note: locking must be provided by the caller. In case of rwlocks only read - * locking is needed - * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the - * end is reached. No guarantee is made with respect to the order in which - * the elements are returned. They might neither be returned in order of - * addition nor in ascending order. - * It is allowed to call ulist_add during an enumeration. Newly added items - * are guaranteed to show up in the running enumeration. - */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) -{ - int next; - - if (ulist->nnodes == 0) - return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) - return NULL; - - return &ulist->nodes[next]; -} -EXPORT_SYMBOL(ulist_next); diff --git a/trunk/fs/btrfs/ulist.h b/trunk/fs/btrfs/ulist.h deleted file mode 100644 index 2e25dec58ec0..000000000000 --- a/trunk/fs/btrfs/ulist.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen - * Distributed under the GNU GPL license version 2. - * - */ - -#ifndef __ULIST__ -#define __ULIST__ - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports is adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - -/* - * element of the list - */ -struct ulist_node { - u64 val; /* value to store */ - unsigned long aux; /* auxiliary value saved along with the val */ -}; - -struct ulist { - /* - * number of elements stored in list - */ - unsigned long nnodes; - - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. 
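
[ulist_add's grow path above has one subtlety worth calling out: while the nodes still live in the inline int_nodes array, krealloc must be handed NULL, since there is no heap block to resize yet, and the inline contents are then copied over by hand. A userspace model of just that step using realloc; grow_list and INLINE_SIZE are illustrative.]

#include <stdlib.h>
#include <string.h>

#define INLINE_SIZE 16

struct grow_list {
	long *items;		/* points at inline_items[] until first growth */
	size_t alloced;
	long inline_items[INLINE_SIZE];
};

static int grow(struct grow_list *l, size_t new_alloced)
{
	/* pass NULL on first growth: inline storage is not a heap block */
	long *old = (l->alloced > INLINE_SIZE) ? l->items : NULL;
	long *n = realloc(old, new_alloced * sizeof(*n));

	if (!n)
		return -1;	/* list left unaltered, like the -ENOMEM case */
	if (!old)
		memcpy(n, l->inline_items, sizeof(l->inline_items));
	l->items = n;
	l->alloced = new_alloced;
	return 0;
}

int main(void)
{
	struct grow_list l = { .alloced = INLINE_SIZE };

	l.items = l.inline_items;
	l.inline_items[0] = 42;
	return grow(&l, INLINE_SIZE + 128) ? 1 : (l.items[0] != 42);
}
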
- */ - struct ulist_node *nodes; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; -}; - -void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); -void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); -void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); - -#endif diff --git a/trunk/fs/btrfs/volumes.c b/trunk/fs/btrfs/volumes.c index 0b4e2af7954d..f4b839fd3c9d 100644 --- a/trunk/fs/btrfs/volumes.c +++ b/trunk/fs/btrfs/volumes.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "compat.h" #include "ctree.h" @@ -33,7 +32,6 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -248,7 +246,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -708,6 +706,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; + mutex_lock(&uuid_mutex); + flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,7 +716,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } - mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -738,9 +737,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: + mutex_unlock(&uuid_mutex); return ret; } @@ -830,6 +829,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, /* * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,7 +848,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, * But if we don't find suitable free space, it is used to store the size of * the max free space. 
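
[The kernel-doc above pins down the calling convention: on success the hole's start and length come back through @start/@len; on failure the function still reports, via *len, the largest hole it saw, so the caller can retry with a smaller chunk. A userspace model of that contract over a sorted extent list; find_free_hole and struct used_ext are illustrative names.]

#include <errno.h>
#include <stdint.h>

struct used_ext { uint64_t start, len; };

static int find_free_hole(const struct used_ext *e, int n, uint64_t dev_size,
			  uint64_t num_bytes, uint64_t *start, uint64_t *len)
{
	uint64_t hole_start = 0, max_hole = 0;
	int i;

	for (i = 0; i <= n; i++) {
		/* hole ends at the next used extent, or at end of device */
		uint64_t hole_end = (i < n) ? e[i].start : dev_size;

		if (hole_end > hole_start) {
			uint64_t hole = hole_end - hole_start;

			if (hole > max_hole)
				max_hole = hole;
			if (hole >= num_bytes) {
				*start = hole_start;
				*len = hole;
				return 0;
			}
		}
		if (i < n)
			hole_start = e[i].start + e[i].len;
	}
	*len = max_hole;	/* best we found; lets caller size down */
	return -ENOSPC;
}

int main(void)
{
	struct used_ext used[] = { { 0, 4096 }, { 8192, 4096 } };
	uint64_t start, len;
	int ret = find_free_hole(used, 2, 1 << 20, 4096, &start, &len);

	return ret ? 1 : !(start == 4096 && len == 4096);
}
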
*/ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -892,7 +893,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1281,6 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1450,6 +1452,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1466,7 +1469,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1625,6 +1629,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1690,7 +1695,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(root); + ret = btrfs_prepare_sprout(trans, root); BUG_ON(ret); } @@ -1752,7 +1757,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } - +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1760,7 +1766,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - return ret; + goto out; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2071,362 +2077,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) return ret; } -static int insert_balance_item(struct btrfs_root *root, - struct btrfs_balance_control *bctl) -{ - struct btrfs_trans_handle *trans; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*item)); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); - btrfs_set_balance_data(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, 
&bctl->meta); - btrfs_set_balance_meta(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); - btrfs_set_balance_sys(leaf, item, &disk_bargs); - - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -static int del_balance_item(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -/* - * This is a heuristic used to reduce the number of chunks balanced on - * resume after balance was interrupted. - */ -static void update_balance_args(struct btrfs_balance_control *bctl) -{ - /* - * Turn on soft mode for chunk types that were being converted. - */ - if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; - - /* - * Turn on usage filter if is not already used. The idea is - * that chunks that we have already balanced should be - * reasonably full. Don't do it for chunks that are being - * converted - that will keep us from relocating unconverted - * (albeit full) chunks. - */ - if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->data.usage = 90; - } - if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->sys.usage = 90; - } - if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->meta.usage = 90; - } -} - -/* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for unset_balance_control. - */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - BUG_ON(!fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = NULL; - spin_unlock(&fs_info->balance_lock); - - kfree(bctl); -} - -/* - * Balance filters. Return 1 if chunk should be filtered out - * (should not be balanced). 
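
[The filters that follow all share one convention: return 1 to exclude the chunk. should_balance_chunk() applies each filter only when its BTRFS_BALANCE_ARGS_* flag is set, and relocates the chunk only if none fires. The usage filter's threshold arithmetic (div_factor_fine) reduces to used < size * usage / 100; a runnable sketch of that check, with made-up constants for illustration.]

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor <= 0)
		return 0;
	if (factor >= 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	uint64_t chunk_size = 1024ULL * 1024 * 1024;	/* 1 GiB chunk */
	uint64_t chunk_used = 300ULL * 1024 * 1024;	/* 300 MiB used */
	int usage = 40;					/* e.g. -dusage=40 */

	/* the filter skips the chunk when used >= threshold */
	uint64_t thresh = div_factor_fine(chunk_size, usage);

	printf("balance this chunk: %s\n",
	       chunk_used < thresh ? "yes" : "no");
	return 0;
}
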
- */ -static int chunk_profiles_filter(u64 chunk_profile, - struct btrfs_balance_args *bargs) -{ - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->profiles & chunk_profile) - return 0; - - return 1; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_block_group_cache *cache; - u64 chunk_used, user_thresh; - int ret = 1; - - cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); - - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); - if (chunk_used < user_thresh) - ret = 0; - - btrfs_put_block_group(cache); - return ret; -} - -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - int i; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; - } - - return 1; -} - -/* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - u64 stripe_offset; - u64 stripe_length; - int factor; - int i; - - if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; - - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) - continue; - - stripe_offset = btrfs_stripe_offset(leaf, stripe); - stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); - - if (stripe_offset < bargs->pend && - stripe_offset + stripe_length > bargs->pstart) - return 0; - } - - return 1; -} - -/* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - if (chunk_offset < bargs->vend && - chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) - /* at least part of the chunk is inside this vrange */ - return 0; - - return 1; -} - -static int chunk_soft_convert_filter(u64 chunk_profile, - struct btrfs_balance_args *bargs) -{ - if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; - - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->target & chunk_profile) - return 1; - - return 0; -} - -static int should_balance_chunk(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) -{ - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - struct btrfs_balance_args *bargs = NULL; - u64 chunk_type = btrfs_chunk_type(leaf, chunk); - - /* type filter */ - if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & - (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; - } - - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) - bargs = 
&bctl->data; - else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) - bargs = &bctl->sys; - else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) - bargs = &bctl->meta; - - /* profiles filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && - chunk_profiles_filter(chunk_type, bargs)) { - return 0; - } - - /* usage filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && - chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { - return 0; - } - - /* devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && - chunk_devid_filter(leaf, chunk, bargs)) { - return 0; - } - - /* drange filter, makes sense only with devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && - chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* vrange filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && - chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* soft profile changing mode */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && - chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; - } - - return 1; -} - static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2436,28 +2086,29 @@ static u64 div_factor(u64 num, int factor) return num; } -static int __btrfs_balance(struct btrfs_fs_info *fs_info) +int btrfs_balance(struct btrfs_root *dev_root) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; + int ret; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; - struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_key found_key; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; - struct extent_buffer *leaf; - int slot; - int ret; - int enospc_errors = 0; - bool counting = true; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2486,23 +2137,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = -ENOMEM; goto error; } - - /* zero out stat counters */ - spin_lock(&fs_info->balance_lock); - memset(&bctl->stat, 0, sizeof(bctl->stat)); - spin_unlock(&fs_info->balance_lock); -again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if ((!counting && atomic_read(&fs_info->balance_pause_req)) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -ECANCELED; - goto error; - } - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2512,19 +2151,15 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) * failed */ if (ret == 0) - BUG(); /* FIXME break ? 
*/ + break; ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - ret = 0; + if (ret) break; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); if (found_key.objectid != key.objectid) break; @@ -2532,375 +2167,22 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) if (found_key.offset == 0) break; - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - - if (!counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.considered++; - spin_unlock(&fs_info->balance_lock); - } - - ret = should_balance_chunk(chunk_root, leaf, chunk, - found_key.offset); btrfs_release_path(path); - if (!ret) - goto loop; - - if (counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.expected++; - spin_unlock(&fs_info->balance_lock); - goto loop; - } - ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; - if (ret == -ENOSPC) { - enospc_errors++; - } else { - spin_lock(&fs_info->balance_lock); - bctl->stat.completed++; - spin_unlock(&fs_info->balance_lock); - } -loop: key.offset = found_key.offset - 1; } - - if (counting) { - btrfs_release_path(path); - counting = false; - goto again; - } + ret = 0; error: btrfs_free_path(path); - if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", - enospc_errors); - if (!ret) - ret = -ENOSPC; - } - + mutex_unlock(&dev_root->fs_info->volume_mutex); return ret; } -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); -} - -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs); - -/* - * Should be called with both balance and volume mutexes held - */ -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - u64 allowed; - int ret; - - if (btrfs_fs_closing(fs_info) || - atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -EINVAL; - goto out; - } - - /* - * In case of mixed groups both data and meta should be picked, - * and identical options should be given for both of them. - */ - allowed = btrfs_super_incompat_flags(fs_info->super_copy); - if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { - if (!(bctl->flags & BTRFS_BALANCE_DATA) || - !(bctl->flags & BTRFS_BALANCE_METADATA) || - memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * Profile changing sanity checks. Skip them if a simple - * balance is requested. 
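
[The sanity checks below start by computing an allowed-profile mask from the device count: DUP only makes sense on a single device, and RAID10 needs at least four. A compact model of that gate; the BG_*/AVAIL_SINGLE bit values here are placeholders, not the on-disk flag values.]

#include <stdint.h>

#define BG_DUP		(1ULL << 0)
#define BG_RAID0	(1ULL << 1)
#define BG_RAID1	(1ULL << 2)
#define BG_RAID10	(1ULL << 3)
#define AVAIL_SINGLE	(1ULL << 4)

static uint64_t allowed_profiles(int num_devices)
{
	uint64_t allowed = AVAIL_SINGLE;	/* single is always valid */

	if (num_devices == 1)
		allowed |= BG_DUP;
	else if (num_devices < 4)
		allowed |= BG_RAID0 | BG_RAID1;
	else
		allowed |= BG_RAID0 | BG_RAID1 | BG_RAID10;
	return allowed;
}

/* a target is valid when it sets no bits outside the allowed mask */
static int target_ok(uint64_t target, int num_devices)
{
	return (target & ~allowed_profiles(num_devices)) == 0;
}

int main(void)
{
	/* e.g. converting to RAID10 on a 2-device fs must be rejected */
	return target_ok(BG_RAID10, 2) ? 1 : 0;
}
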
- */ - if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & - BTRFS_BALANCE_ARGS_CONVERT)) - goto do_balance; - - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); - - if (!profile_is_valid(bctl->data.target, 1) || - bctl->data.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", - (unsigned long long)bctl->data.target); - ret = -EINVAL; - goto out; - } - if (!profile_is_valid(bctl->meta.target, 1) || - bctl->meta.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", - (unsigned long long)bctl->meta.target); - ret = -EINVAL; - goto out; - } - if (!profile_is_valid(bctl->sys.target, 1) || - bctl->sys.target & ~allowed) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", - (unsigned long long)bctl->sys.target); - ret = -EINVAL; - goto out; - } - - if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); - ret = -EINVAL; - goto out; - } - - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; - } - } - -do_balance: - ret = insert_balance_item(fs_info->tree_root, bctl); - if (ret && ret != -EEXIST) - goto out; - - if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { - BUG_ON(ret == -EEXIST); - set_balance_control(bctl); - } else { - BUG_ON(ret != -EEXIST); - spin_lock(&fs_info->balance_lock); - update_balance_args(bctl); - spin_unlock(&fs_info->balance_lock); - } - - atomic_inc(&fs_info->balance_running); - mutex_unlock(&fs_info->balance_mutex); - - ret = __btrfs_balance(fs_info); - - mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); - - if (bargs) { - memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); - } - - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { - __cancel_balance(fs_info); - } - - wake_up(&fs_info->balance_wait_q); - - return ret; -out: - if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else - kfree(bctl); - return ret; -} - -static int balance_kthread(void *data) -{ - struct btrfs_balance_control *bctl = - (struct btrfs_balance_control *)data; - struct btrfs_fs_info *fs_info = bctl->fs_info; - int ret = 0; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); - } else { - printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); - } - - 
mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -int btrfs_recover_balance(struct btrfs_root *tree_root) -{ - struct task_struct *tsk; - struct btrfs_balance_control *bctl; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) - goto out_bctl; - if (ret > 0) { /* ret = -ENOENT; */ - ret = 0; - goto out_bctl; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - bctl->fs_info = tree_root->fs_info; - bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; - - btrfs_balance_data(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); - btrfs_balance_meta(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); - btrfs_balance_sys(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - - tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); - if (IS_ERR(tsk)) - ret = PTR_ERR(tsk); - else - goto out; - -out_bctl: - kfree(bctl); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_pause_balance(struct btrfs_fs_info *fs_info) -{ - int ret = 0; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - if (atomic_read(&fs_info->balance_running)) { - atomic_inc(&fs_info->balance_pause_req); - mutex_unlock(&fs_info->balance_mutex); - - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - - mutex_lock(&fs_info->balance_mutex); - /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_pause_req); - } else { - ret = -ENOTCONN; - } - - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) -{ - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - atomic_inc(&fs_info->balance_cancel_req); - /* - * if we are running just wait and return, balance item is - * deleted in btrfs_balance in this case - */ - if (atomic_read(&fs_info->balance_running)) { - mutex_unlock(&fs_info->balance_mutex); - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - mutex_lock(&fs_info->balance_mutex); - } else { - /* __cancel_balance needs volume_mutex */ - mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); - } - - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_cancel_req); - mutex_unlock(&fs_info->balance_mutex); - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
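
[The deleted btrfs_pause_balance()/btrfs_cancel_balance() above implement a small handshake: raise a request counter, then sleep until the worker observes it and drops balance_running to zero. Below is a userspace sketch of that handshake with pthreads standing in for wait_event()/wake_up(); the names mirror the fields above, but this is only a model, not the kernel code.]

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_q = PTHREAD_COND_INITIALIZER;
static int running;	/* stands in for balance_running */
static int pause_req;	/* stands in for balance_pause_req */

void pause_balance(void)
{
	pthread_mutex_lock(&lock);
	if (running) {
		pause_req++;
		while (running)		/* wait_event(balance_wait_q, ...) */
			pthread_cond_wait(&wait_q, &lock);
		pause_req--;
	}
	pthread_mutex_unlock(&lock);
}

void balance_stopped(void)	/* called by the balance worker */
{
	pthread_mutex_lock(&lock);
	running = 0;
	pthread_cond_broadcast(&wait_q);	/* wake_up(&balance_wait_q) */
	pthread_mutex_unlock(&lock);
}
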
@@ -3041,7 +2323,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) return ret; } -static int btrfs_add_system_chunk(struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -3158,14 +2441,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; - else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = 8 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", @@ -3217,7 +2496,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, + ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -3408,7 +2687,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, item_size); BUG_ON(ret); } @@ -3473,7 +2752,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -3483,7 +2763,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, @@ -3620,13 +2901,26 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; + int stripes_allocated = 8; + int stripes_required = 1; int stripe_index; int i; - int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + stripes_allocated = 1; +again: + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), + GFP_NOFS); + if (!bbio) + return -ENOMEM; + + atomic_set(&bbio->error, 0); + } + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -3645,6 +2939,32 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, if (mirror_num > map->num_stripes) mirror_num = 0; + /* if our btrfs_bio struct is too small, back off and try again */ + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = 
map->sub_stripes; + max_errors = 1; + } + } + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(bbio); + goto again; + } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3660,7 +2980,10 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); @@ -3736,55 +3059,81 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, } BUG_ON(stripe_index >= map->num_stripes); - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); - if (!bbio) { - ret = -ENOMEM; - goto out; - } - atomic_set(&bbio->error, 0); - if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - } - for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - if (i < sub_stripes) + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + bbio->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { bbio->stripes[i].length -= stripe_offset; - if ((i / sub_stripes + 1) % - sub_stripes == remaining_stripes) + stripe_offset = 0; + } + if (stripe_index == last_stripe) bbio->stripes[i].length -= stripe_end_offset; - if (i == sub_stripes - 1) - stripe_offset = 0; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + bbio->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { 
+ bbio->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + bbio->stripes[i].length -= + stripe_end_offset; + } } else bbio->stripes[i].length = *length; @@ -3806,22 +3155,15 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, stripe_index++; } } - - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } - - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; out: free_extent_map(em); - return ret; + return 0; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, @@ -3962,7 +3304,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); bio_put(bio); return 0; } @@ -4057,7 +3399,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -4226,7 +3568,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + mutex_lock(&uuid_mutex); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -4264,6 +3606,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: + mutex_unlock(&uuid_mutex); return ret; } @@ -4406,9 +3749,6 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; - mutex_lock(&uuid_mutex); - lock_chunks(root); - /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -4459,9 +3799,6 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } ret = 0; error: - unlock_chunks(root); - mutex_unlock(&uuid_mutex); - btrfs_free_path(path); return ret; } diff --git a/trunk/fs/btrfs/volumes.h b/trunk/fs/btrfs/volumes.h index 19ac95048b88..78f2d4d4f37f 100644 --- a/trunk/fs/btrfs/volumes.h +++ b/trunk/fs/btrfs/volumes.h @@ -186,51 +186,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) - -/* - * Profile changing flags. 
When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - -struct btrfs_balance_args; -struct btrfs_balance_progress; -struct btrfs_balance_control { - struct btrfs_fs_info *fs_info; - - struct btrfs_balance_args data; - struct btrfs_balance_args meta; - struct btrfs_balance_args sys; - - u64 flags; - - struct btrfs_balance_progress stat; -}; - int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -273,12 +228,9 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs); -int btrfs_recover_balance(struct btrfs_root *tree_root); -int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif diff --git a/trunk/fs/btrfs/xattr.c b/trunk/fs/btrfs/xattr.c index e7a5659087e6..3848b04e310e 100644 --- a/trunk/fs/btrfs/xattr.c +++ b/trunk/fs/btrfs/xattr.c @@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); return ret; } diff --git a/trunk/fs/namei.c b/trunk/fs/namei.c index 208c6aa4a989..c283a1ec008e 100644 --- a/trunk/fs/namei.c +++ b/trunk/fs/namei.c @@ -140,19 +140,21 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *result = __getname(); - int retval; - - if (!result) - return ERR_PTR(-ENOMEM); - - retval = do_getname(filename, result); - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(result); - return ERR_PTR(retval); + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + int retval = do_getname(filename, tmp); + + result = tmp; + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 9cde9edf9c4d..5485a5388ecb 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -198,7 +198,65 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +static struct mm_struct *__check_mem_permission(struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (!mm) + return ERR_PTR(-EINVAL); + + /* + * A task can always look at itself, in case it chooses + * to use system calls instead of load instructions. 
+ */ + if (task == current) + return mm; + + /* + * If current is actively ptrace'ing, and would also be + * permitted to freshly attach with ptrace now, permit it. + */ + if (task_is_stopped_or_traced(task)) { + int match; + rcu_read_lock(); + match = (ptrace_parent(task) == current); + rcu_read_unlock(); + if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) + return mm; + } + + /* + * No one else is allowed. + */ + mmput(mm); + return ERR_PTR(-EPERM); +} + +/* + * If current may access user memory in @task return a reference to the + * corresponding mm, otherwise ERR_PTR. + */ +static struct mm_struct *check_mem_permission(struct task_struct *task) +{ + struct mm_struct *mm; + int err; + + /* + * Avoid racing if task exec's as we might get a new mm but validate + * against old credentials. + */ + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = __check_mem_permission(task); + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + +struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; int err; @@ -209,7 +267,7 @@ static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + !ptrace_may_access(task, PTRACE_MODE_READ)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -218,11 +276,6 @@ static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } -struct mm_struct *mm_for_maps(struct task_struct *task) -{ - return mm_access(task, PTRACE_MODE_READ); -} - static int proc_pid_cmdline(struct task_struct *task, char * buffer) { int res = 0; @@ -699,39 +752,38 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - struct mm_struct *mm; - - if (!task) - return -ESRCH; - - mm = mm_access(task, PTRACE_MODE_ATTACH); - put_task_struct(task); - - if (IS_ERR(mm)) - return PTR_ERR(mm); - + file->private_data = (void*)((long)current->self_exec_id); /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; - file->private_data = mm; - return 0; } static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - int ret; + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char *page; unsigned long src = *ppos; - struct mm_struct *mm = file->private_data; + int ret = -ESRCH; + struct mm_struct *mm; - if (!mm) - return 0; + if (!task) + goto out_no_task; + ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - return -ENOMEM; + goto out; + + mm = check_mem_permission(task); + ret = PTR_ERR(mm); + if (IS_ERR(mm)) + goto out_free; + + ret = -EIO; + + if (file->private_data != (void*)((long)current->self_exec_id)) + goto out_put; ret = 0; @@ -758,7 +810,13 @@ static ssize_t mem_read(struct file * file, char __user * buf, } *ppos = src; +out_put: + mmput(mm); +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return ret; } @@ -767,15 +825,27 @@ static ssize_t mem_write(struct file * file, const char __user *buf, { int copied; char *page; + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; - struct mm_struct *mm = file->private_data; + struct mm_struct *mm; - if (!mm) - return 0; + copied = -ESRCH; + if (!task) + goto out_no_task; + copied = 
-ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - return -ENOMEM; + goto out_task; + + mm = check_mem_permission(task); + copied = PTR_ERR(mm); + if (IS_ERR(mm)) + goto out_free; + + copied = -EIO; + if (file->private_data != (void *)((long)current->self_exec_id)) + goto out_mm; copied = 0; while (count > 0) { @@ -799,7 +869,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf, } *ppos = dst; +out_mm: + mmput(mm); +out_free: free_page((unsigned long) page); +out_task: + put_task_struct(task); +out_no_task: return copied; } @@ -819,20 +895,11 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } -static int mem_release(struct inode *inode, struct file *file) -{ - struct mm_struct *mm = file->private_data; - - mmput(mm); - return 0; -} - static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, - .release = mem_release, }; static ssize_t environ_read(struct file *file, char __user *buf, @@ -1132,6 +1199,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); @@ -1160,7 +1230,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; diff --git a/trunk/fs/xfs/xfs_aops.c b/trunk/fs/xfs/xfs_aops.c index 74b9baf36ac3..574d4ee9b625 100644 --- a/trunk/fs/xfs/xfs_aops.c +++ b/trunk/fs/xfs/xfs_aops.c @@ -111,7 +111,8 @@ xfs_ioend_new_eof( xfs_fsize_t bsize; bsize = ioend->io_offset + ioend->io_size; - isize = MIN(i_size_read(VFS_I(ip)), bsize); + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); return isize > ip->i_d.di_size ? isize : 0; } @@ -125,7 +126,11 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) } /* - * Update on-disk file size now that data has been written to disk. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. * * This function does not block as blocking on the inode lock in IO completion * can lead to IO completion order dependency deadlocks.. If it can't get the @@ -1273,15 +1278,6 @@ xfs_end_io_direct_write( { struct xfs_ioend *ioend = iocb->private; - /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. - */ - if (offset + size > i_size_read(ioend->io_inode)) - i_size_write(ioend->io_inode, offset + size); - /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect @@ -1344,11 +1340,12 @@ xfs_vm_write_failed( if (to > inode->i_size) { /* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have - * made it to disk yet as the page is still locked at this - * point. 
+ * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we just truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. */ struct xfs_inode *ip = XFS_I(inode); xfs_fileoff_t start_fsb; diff --git a/trunk/fs/xfs/xfs_attr.c b/trunk/fs/xfs/xfs_attr.c index 08b9ac644c31..1e5d97f86ea8 100644 --- a/trunk/fs/xfs/xfs_attr.c +++ b/trunk/fs/xfs/xfs_attr.c @@ -827,6 +827,10 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/trunk/fs/xfs/xfs_attr_leaf.c b/trunk/fs/xfs/xfs_attr_leaf.c index d25eafd4d28d..c1b55e596551 100644 --- a/trunk/fs/xfs/xfs_attr_leaf.c +++ b/trunk/fs/xfs/xfs_attr_leaf.c @@ -271,6 +271,10 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -322,6 +326,7 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -384,6 +389,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } diff --git a/trunk/fs/xfs/xfs_bmap.c b/trunk/fs/xfs/xfs_bmap.c index 188ef2fbd628..d0ab78837057 100644 --- a/trunk/fs/xfs/xfs_bmap.c +++ b/trunk/fs/xfs/xfs_bmap.c @@ -249,27 +249,7 @@ xfs_bmbt_lookup_ge( } /* - * Check if the inode needs to be converted to btree format. - */ -static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) -{ - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); -} - -/* - * Check if the inode should be converted to extent format. - */ -static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) -{ - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); -} - -/* - * Update the record referred to by cur to the value given +* Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error.
*/ @@ -703,8 +683,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -787,8 +767,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -856,8 +836,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -904,7 +884,8 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1440,7 +1421,8 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1830,7 +1812,8 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3054,7 +3037,8 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); /* * Make space in the inode incore. 
*/ @@ -3200,8 +3184,13 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) + if (dfl_forkoff > ip->i_d.di_forkoff) { ip->i_d.di_forkoff = dfl_forkoff; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); + } } } @@ -3441,6 +3430,8 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3495,9 +3486,12 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3541,17 +3535,20 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3997,8 +3994,11 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; + if (whichfork == XFS_DATA_FORK) { + return S_ISREG(ip->i_d.di_mode) ? + (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4010,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,6 +4379,8 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4869,6 +4871,8 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4977,7 +4981,8 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. 
*/ - if (xfs_bmap_wants_extents(ip, whichfork)) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4987,10 +4992,10 @@ xfs_bmapi_write( if (error) goto error0; } - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); error = 0; error0: /* @@ -5090,7 +5095,8 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5316,8 +5322,7 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ - XFS_IFORK_MAXEXT(ip, whichfork) && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5348,11 +5353,13 @@ xfs_bunmapi( } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); /* * Convert to a btree if necessary. */ - if (xfs_bmap_needs_btree(ip, whichfork)) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5363,7 +5370,8 @@ xfs_bunmapi( /* * transform from btree to extents, give it cur */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5374,6 +5382,8 @@ xfs_bunmapi( /* * transform from extents to local? */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5424,7 +5434,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. 
*/ @@ -5512,7 +5522,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = XFS_ISIZE(ip); + fixlen = ip->i_size; } } @@ -5541,7 +5551,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; diff --git a/trunk/fs/xfs/xfs_dfrag.c b/trunk/fs/xfs/xfs_dfrag.c index dd974a55c77d..654dc6f05bac 100644 --- a/trunk/fs/xfs/xfs_dfrag.c +++ b/trunk/fs/xfs/xfs_dfrag.c @@ -163,14 +163,12 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) return EINVAL; /* @@ -182,25 +180,18 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) - return EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - } + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) + return EINVAL; /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) - return EINVAL; - - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - } + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) + return EINVAL; return 0; } @@ -357,6 +348,16 @@ xfs_swap_extents( *ifp = *tifp; /* struct copy */ *tifp = *tempifp; /* struct copy */ + /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. 
+ */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + /* * Fix the on-disk inode values */ diff --git a/trunk/fs/xfs/xfs_file.c b/trunk/fs/xfs/xfs_file.c index 7e5bc872f2b4..f675f3d9d7b3 100644 --- a/trunk/fs/xfs/xfs_file.c +++ b/trunk/fs/xfs/xfs_file.c @@ -327,7 +327,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if (iocb->ki_pos == ip->i_size) return 0; return -XFS_ERROR(EINVAL); } @@ -412,6 +412,51 @@ xfs_file_splice_read( return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip, + xfs_fsize_t new_size) +{ + if (new_size == ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (new_size == ip->i_new_size) + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -430,6 +475,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -443,12 +489,19 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -636,26 +689,28 @@ xfs_zero_eof( /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to - * @iolock, and returns with it held. Might upgrade the iolock to exclusive - * if called for a direct write beyond i_size. + * Returns with iolock held according to @iolock. 
*/ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; return error; } @@ -665,21 +720,36 @@ xfs_file_aio_write_checks( /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which involves - * dropping all locks and relocking to maintain correct locking order. - * If we do this, restart the function to ensure all checks and values - * are still valid. + * write. There is no need to issue zeroing if another in-flight IO ends + * at or before this one. If zeroing is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > i_size_read(inode)) { + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = -xfs_zero_eof(ip, *pos, ip->i_size); } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. + */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -724,7 +794,9 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t ocount, + xfs_fsize_t *new_size, + int *iolock) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -734,10 +806,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; - int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; + *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -752,31 +824,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - iolock = XFS_IOLOCK_EXCL; + *iolock = XFS_IOLOCK_EXCL; else - iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, *iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock.
*/ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) - goto out; + return ret; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - goto out; + return ret; } /* @@ -785,18 +857,15 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + *iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); -out: - xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -808,7 +877,9 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t ocount, + xfs_fsize_t *new_size, + int *iolock) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -816,14 +887,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - xfs_rw_ilock(ip, iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) - goto out; + return ret; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -837,15 +908,13 @@ xfs_file_buffered_aio_write( * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - enospc = 1; ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + if (ret) + return ret; + enospc = 1; + goto write_retry; } - current->backing_dev_info = NULL; -out: - xfs_rw_iunlock(ip, iolock); return ret; } @@ -861,7 +930,9 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; + int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -880,22 +951,33 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + ocount, &new_size, &iolock); + + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); - if (ret > 0) { - ssize_t err; + if (ret <= 0) + goto out_unlock; - XFS_STATS_ADD(xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error; - /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); - if (err < 0) - ret = err; + xfs_rw_iunlock(ip, iolock); + error = xfs_file_fsync(file, pos, end, + (file->f_flags & __O_SYNC) ? 
0 : 1); + xfs_rw_ilock(ip, iolock); + if (error) + ret = error; } +out_unlock: + xfs_aio_write_newsize_update(ip, new_size); + xfs_rw_iunlock(ip, iolock); return ret; } diff --git a/trunk/fs/xfs/xfs_fs_subr.c b/trunk/fs/xfs/xfs_fs_subr.c index 652b875a9d4c..ed88ed16811c 100644 --- a/trunk/fs/xfs/xfs_fs_subr.c +++ b/trunk/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); + last == -1 ? ip->i_size - 1 : last); } return 0; } diff --git a/trunk/fs/xfs/xfs_iget.c b/trunk/fs/xfs/xfs_iget.c index 8c3e46394d48..3960a066d7ff 100644 --- a/trunk/fs/xfs/xfs_iget.c +++ b/trunk/fs/xfs/xfs_iget.c @@ -77,7 +77,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); + ASSERT(completion_done(&ip->i_flush)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -94,6 +94,8 @@ xfs_inode_alloc( ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + ip->i_size = 0; + ip->i_new_size = 0; return ip; } @@ -148,7 +150,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); + ASSERT(completion_done(&ip->i_flush)); /* * Because we use RCU freeing we need to ensure the inode always @@ -448,6 +450,8 @@ xfs_iget( *ipp = ip; + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -711,19 +715,3 @@ xfs_isilocked( return 0; } #endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/trunk/fs/xfs/xfs_inode.c b/trunk/fs/xfs/xfs_inode.c index b21022499c2e..9dda7cc32848 100644 --- a/trunk/fs/xfs/xfs_inode.c +++ b/trunk/fs/xfs/xfs_inode.c @@ -299,8 +299,11 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error = 0; + int error; xfs_fsize_t di_size; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -347,6 +350,7 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -405,10 +409,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; - ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -600,11 +604,10 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. 
*/ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || - XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max + || XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) + || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -832,6 +835,12 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; + /* + * Initialize the per-fork minima and maxima for a new + * inode here. xfs_iformat will do it for old inodes. + */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -852,6 +861,7 @@ xfs_iread( } ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -1041,6 +1051,7 @@ xfs_ialloc( } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1154,6 +1165,52 @@ xfs_ialloc( return 0; } +/* + * Check to make sure that there are no blocks allocated to the + * file beyond the size of the file. We don't check this for + * files with fixed size extents or real time extents, but we + * at least do it for regular files. + */ +#ifdef DEBUG +STATIC void +xfs_isize_check( + struct xfs_inode *ip, + xfs_fsize_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; + int error; + + if (!S_ISREG(ip->i_d.di_mode)) + return; + + if (XFS_IS_REALTIME_INODE(ip)) + return; + + if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + return; + + nimaps = 2; + map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + /* + * The filesystem could be shutting down, so bmapi may return + * an error. + */ + error = xfs_bmapi_read(ip, map_first, + (XFS_B_TO_FSB(mp, + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; + ASSERT(nimaps == 1); + ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); +} +#else /* DEBUG */ +#define xfs_isize_check(ip, isize) +#endif /* DEBUG */ + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1195,14 +1252,12 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= XFS_ISIZE(ip)); + ASSERT(new_size <= ip->i_size); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - trace_xfs_itruncate_extents_start(ip, new_size); - /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1270,14 +1325,6 @@ xfs_itruncate_extents( goto out; } - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. 
- */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_extents_end(ip, new_size); - out: *tpp = tp; return error; @@ -1291,6 +1338,74 @@ xfs_itruncate_extents( goto out; } +int +xfs_itruncate_data( + struct xfs_trans **tpp, + struct xfs_inode *ip, + xfs_fsize_t new_size) +{ + int error; + + trace_xfs_itruncate_data_start(ip, new_size); + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new_size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + if (ip->i_d.di_nextents > 0) { + /* + * If we are not changing the file size then do not update + * the on-disk file size - we may be called from + * xfs_inactive_free_eofblocks(). If we update the on-disk + * file size and then the system crashes before the contents + * of the file are flushed to disk then the files may be + * full of holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + } + } + + error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); + if (error) + return error; + + /* + * If we are not changing the file size then do not update the on-disk + * file size - we may be called from xfs_inactive_free_eofblocks(). + * If we update the on-disk file size and then the system crashes + * before the contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + xfs_isize_check(ip, new_size); + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } + + ASSERT(new_size != 0 || ip->i_delayed_blks == 0); + ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_data_end(ip, new_size); + return 0; +} + /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1709,7 +1824,8 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1728,6 +1844,8 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2033,7 +2151,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. 
*/ static void -xfs_iunpin( +xfs_iunpin_nowait( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2045,29 +2163,14 @@ xfs_iunpin( } -static void -__xfs_iunpin_wait( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); - - xfs_iunpin(ip); - - do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_ipincount(ip)) - io_schedule(); - } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); -} - void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) - __xfs_iunpin_wait(ip); + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } } /* @@ -2407,9 +2510,9 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ip->i_d.di_nextents > ip->i_df.if_ext_max); iip = ip->i_itemp; mp = ip->i_mount; @@ -2426,7 +2529,7 @@ xfs_iflush( * out for us if they occur after the log force completes. */ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); + xfs_iunpin_nowait(ip); xfs_ifunlock(ip); return EAGAIN; } @@ -2523,9 +2626,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ip->i_d.di_nextents > ip->i_df.if_ext_max); iip = ip->i_itemp; mp = ip->i_mount; diff --git a/trunk/fs/xfs/xfs_inode.h b/trunk/fs/xfs/xfs_inode.h index 2f27b7454085..f0e6b151ba37 100644 --- a/trunk/fs/xfs/xfs_inode.h +++ b/trunk/fs/xfs/xfs_inode.h @@ -66,6 +66,7 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -205,12 +206,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) -#define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + #ifdef __KERNEL__ +struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -219,6 +220,12 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; + typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -237,19 +244,27 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ + wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. 
*/ - unsigned long i_flags; /* see defined flags below */ + unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ + xfs_fsize_t i_size; /* in-memory size */ + xfs_fsize_t i_new_size; /* size when write completes */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ + (ip)->i_size : (ip)->i_d.di_size; + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -262,18 +277,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } -/* - * For regular files we only update the on-disk filesize when actually - * writing data back to disk. Until then only the copy in the VFS inode - * is uptodate. - */ -static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) -{ - if (S_ISREG(ip->i_d.di_mode)) - return i_size_read(VFS_I(ip)); - return ip->i_d.di_size; -} - /* * i_flags helper functions */ @@ -328,19 +331,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } -static inline int -xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) -{ - int ret; - - spin_lock(&ip->i_flags_lock); - ret = ip->i_flags & flags; - if (!ret) - ip->i_flags |= flags; - spin_unlock(&ip->i_flags_lock); - return ret; -} - /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -360,20 +350,36 @@ xfs_set_projid(struct xfs_inode *ip, ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); } +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. + */ +static inline void xfs_iflock(xfs_inode_t *ip) +{ + wait_for_completion(&ip->i_flush); +} + +static inline int xfs_iflock_nowait(xfs_inode_t *ip) +{ + return try_wait_for_completion(&ip->i_flush); +} + +static inline void xfs_ifunlock(xfs_inode_t *ip) +{ + complete(&ip->i_flush); +} + /* * In-core inode flags. */ -#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ -#define XFS_ISTALE (1 << 1) /* inode has been staled */ -#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ -#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. 
*/ -#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) -#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ -#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -385,34 +391,6 @@ xfs_set_projid(struct xfs_inode *ip, XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ XFS_IFILESTREAM); -/* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - xfs_iflags_clear(ip, XFS_IFLOCK); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -513,6 +491,8 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); +int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, + xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/trunk/fs/xfs/xfs_inode_item.c b/trunk/fs/xfs/xfs_inode_item.c index 91d71dcd4852..cfd6c7f8cc3c 100644 --- a/trunk/fs/xfs/xfs_inode_item.c +++ b/trunk/fs/xfs/xfs_inode_item.c @@ -79,6 +79,8 @@ xfs_inode_item_size( break; case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); iip->ili_format.ilf_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV | XFS_ILOG_UUID); @@ -555,7 +557,7 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + wake_up(&ip->i_ipin_wait); } /* @@ -717,7 +719,7 @@ xfs_inode_item_pushbuf( * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. 
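The flag definitions above change shape for a reason: the removed variants spell the lock bits out as bit numbers (__XFS_IFLOCK_BIT, __XFS_IPINNED_BIT) precisely so sleepers can queue on bit_waitqueue(&ip->i_flags, bit) and the release side can pair a clear with wake_up_bit(), while the restored plain hex masks carry no such requirement. A sketch of the bit-keyed lock pattern, assuming <linux/wait.h> (names are illustrative):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

#define MY_LOCK_BIT     7       /* bit number inside the flags word */

static void my_bit_lock(unsigned long *flags)
{
        wait_queue_head_t *wq = bit_waitqueue(flags, MY_LOCK_BIT);
        DEFINE_WAIT_BIT(wait, flags, MY_LOCK_BIT);

        while (test_and_set_bit(MY_LOCK_BIT, flags)) {
                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                if (test_bit(MY_LOCK_BIT, flags))
                        schedule();
                finish_wait(wq, &wait.wait);
        }
}

static void my_bit_unlock(unsigned long *flags)
{
        clear_bit_unlock(MY_LOCK_BIT, flags);
        /* Keyed wakeup: only waiters on this word/bit pair are woken. */
        wake_up_bit(flags, MY_LOCK_BIT);
}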
*/ - if (!xfs_isiflocked(ip) || + if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return true; @@ -750,7 +752,7 @@ xfs_inode_item_push( struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(!completion_done(&ip->i_flush)); /* * Since we were able to lock the inode's flush lock and diff --git a/trunk/fs/xfs/xfs_iomap.c b/trunk/fs/xfs/xfs_iomap.c index 246c7d57c6f9..9afa282aa937 100644 --- a/trunk/fs/xfs/xfs_iomap.c +++ b/trunk/fs/xfs/xfs_iomap.c @@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align = 0; + xfs_extlen_t align; int eof, error; - if (!XFS_IS_REALTIME_INODE(ip)) { - /* - * Round up the allocation request to a stripe unit - * (m_dalign) boundary if the file size is >= stripe unit - * size, and we are allocating past the allocation eof. - * - * If mounted with the "-o swalloc" option the alignment is - * increased from the strip unit size to the stripe width. - */ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) - align = mp->m_swidth; - else if (mp->m_dalign) - align = mp->m_dalign; - - if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) - new_last_fsb = roundup_64(*last_fsb, align); - } + if (XFS_IS_REALTIME_INODE(ip)) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); /* * Always round up the allocation request to an extent boundary @@ -154,7 +154,7 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > XFS_ISIZE(ip)) { + if ((offset + count) > ip->i_size) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; @@ -211,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < XFS_ISIZE(ip) || extsz) + if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if (offset + count <= XFS_ISIZE(ip)) + if ((offset + count) <= ip->i_size) return 0; /* @@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... 
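Both alignment branches above reduce to the same arithmetic: round the requested last block up to a multiple of the chosen alignment (stripe unit m_dalign, or stripe width m_swidth under -o swalloc), and only once the file size has reached one alignment unit. A small runnable illustration of the rounding that roundup_64() performs (plain C; the values are invented for the example):

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next multiple of align (align > 0). */
static uint64_t roundup64(uint64_t x, uint64_t align)
{
        return ((x + align - 1) / align) * align;
}

int main(void)
{
        uint64_t last_fsb = 1000;       /* requested last block */
        uint64_t dalign = 64;           /* stripe unit, in blocks */

        /* 1000 -> 1024: the allocation now ends on a stripe boundary. */
        printf("aligned last_fsb = %llu\n",
               (unsigned long long)roundup64(last_fsb, dalign));
        return 0;
}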
*/ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) diff --git a/trunk/fs/xfs/xfs_iops.c b/trunk/fs/xfs/xfs_iops.c index ab302539e5b9..f9babd179223 100644 --- a/trunk/fs/xfs/xfs_iops.c +++ b/trunk/fs/xfs/xfs_iops.c @@ -750,7 +750,6 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; - xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; @@ -778,13 +777,11 @@ xfs_setattr_size( lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); - oldsize = inode->i_size; - newsize = iattr->ia_size; - /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -810,14 +807,14 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (newsize > oldsize) { + if (iattr->ia_size > ip->i_size) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); if (error) goto out_unlock; } @@ -836,8 +833,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, + if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, FI_NONE); if (error) goto out_unlock; @@ -848,7 +845,8 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, iattr->ia_size, + xfs_get_blocks); if (error) goto out_unlock; @@ -859,7 +857,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, newsize); + truncate_setsize(inode, iattr->ia_size); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -878,29 +876,19 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. 
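When the setattr path above grows a file, xfs_zero_eof() first zeroes whatever sits in the last block beyond the old EOF, so stale block contents can never become visible inside the new size. A rough userspace analogue of that step (a sketch only, not the XFS implementation; error handling trimmed):

#include <unistd.h>
#include <string.h>
#include <sys/types.h>

/*
 * Zero from old_size up to the end of its block (clamped to new_size),
 * then extend the file, so no stale bytes appear inside the new size.
 */
static int zero_eof(int fd, off_t old_size, off_t new_size, off_t blksz)
{
        off_t end = ((old_size + blksz - 1) / blksz) * blksz;
        char zeros[4096];

        if (end > new_size)
                end = new_size;
        memset(zeros, 0, sizeof(zeros));
        while (old_size < end) {
                size_t n = (size_t)(end - old_size);

                if (n > sizeof(zeros))
                        n = sizeof(zeros);
                if (pwrite(fd, zeros, n, old_size) != (ssize_t)n)
                        return -1;
                old_size += n;
        }
        return ftruncate(fd, new_size);
}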
- */ - ip->i_d.di_size = newsize; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - if (newsize <= oldsize) { - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { + error = xfs_itruncate_data(&tp, ip, iattr->ia_size); if (error) goto out_trans_abort; diff --git a/trunk/fs/xfs/xfs_qm_syscalls.c b/trunk/fs/xfs/xfs_qm_syscalls.c index eafbcff81f3a..5cc3dde1bc90 100644 --- a/trunk/fs/xfs/xfs_qm_syscalls.c +++ b/trunk/fs/xfs/xfs_qm_syscalls.c @@ -31,7 +31,6 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -264,18 +263,13 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } - ASSERT(ip->i_d.di_nextents == 0); - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); diff --git a/trunk/fs/xfs/xfs_super.c b/trunk/fs/xfs/xfs_super.c index ee5b695c99a7..281961c1d81a 100644 --- a/trunk/fs/xfs/xfs_super.c +++ b/trunk/fs/xfs/xfs_super.c @@ -828,6 +828,14 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + init_waitqueue_head(&ip->i_ipin_wait); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); diff --git a/trunk/fs/xfs/xfs_sync.c b/trunk/fs/xfs/xfs_sync.c index 40b75eecd2b4..72c01a1c16e7 100644 --- a/trunk/fs/xfs/xfs_sync.c +++ b/trunk/fs/xfs/xfs_sync.c @@ -707,13 +707,14 @@ xfs_reclaim_inode_grab( return 1; /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. + * do some unlocked checks first to avoid unnecessary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. 
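The xfs_fs_inode_init_once() hunk earlier in this chunk documents a small trick worth restating: completions count, so issuing complete() right after init_completion() leaves one token available, and the first try_wait_for_completion() then succeeds without blocking, i.e. the flush lock starts out released. The same semantics in runnable userspace C, with a POSIX semaphore standing in for the counting primitive (an analogue, not kernel code):

#include <semaphore.h>
#include <stdio.h>

int main(void)
{
        sem_t flush;

        sem_init(&flush, 0, 0); /* like init_completion(): count 0 */
        sem_post(&flush);       /* like complete(): count 1, "unlocked" */

        /* First trylock succeeds; the second fails until someone posts. */
        printf("first  trywait: %s\n", sem_trywait(&flush) == 0 ? "ok" : "busy");
        printf("second trywait: %s\n", sem_trywait(&flush) == 0 ? "ok" : "busy");

        sem_destroy(&flush);
        return 0;
}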
*/ if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { return 1; + } /* * The radix tree lock here protects a thread in xfs_iget from racing diff --git a/trunk/fs/xfs/xfs_trace.h b/trunk/fs/xfs/xfs_trace.h index 6b6df5802e95..a9d5b1e06efe 100644 --- a/trunk/fs/xfs/xfs_trace.h +++ b/trunk/fs/xfs/xfs_trace.h @@ -891,6 +891,7 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -899,15 +900,17 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, + __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -975,6 +978,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) + __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -986,6 +990,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -993,11 +998,13 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
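The SYNC_TRYLOCK test above is a standard fast path: when the caller must not block, peek at the state without taking any locks and bail out early, accepting that the racy read occasionally lets a caller through to the authoritative locked re-check. A generic sketch of that shape (illustrative names; the locked path always re-validates):

#include <linux/spinlock.h>
#include <linux/types.h>

#define OBJ_BUSY        0x01

struct obj {
        spinlock_t      lock;
        unsigned long   flags;  /* OBJ_BUSY etc. */
};

static int obj_try_grab(struct obj *o, bool nonblocking)
{
        /* Racy, unlocked peek: a cheap rejection for the trylock case. */
        if (nonblocking && (o->flags & OBJ_BUSY))
                return -1;

        spin_lock(&o->lock);
        /* Authoritative re-check under the lock. */
        if (o->flags & OBJ_BUSY) {
                spin_unlock(&o->lock);
                return -1;
        }
        o->flags |= OBJ_BUSY;
        spin_unlock(&o->lock);
        return 0;
}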
irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd type %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, + __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1024,23 +1031,26 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) + __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = VFS_I(ip)->i_size; + __entry->isize = ip->i_size; __entry->disize = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, + __entry->new_size, __entry->offset, __entry->count) ); @@ -1080,8 +1090,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1558,6 +1568,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) + __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1567,16 +1578,18 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "broot size %d, fork offset %d", + "Max in-fork extents %d, broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, + __entry->max_nex, __entry->broot_size, __entry->fork_off) ) diff --git a/trunk/fs/xfs/xfs_vnodeops.c b/trunk/fs/xfs/xfs_vnodeops.c index 0cf52da9d246..f2fea868d4db 100644 --- a/trunk/fs/xfs/xfs_vnodeops.c +++ b/trunk/fs/xfs/xfs_vnodeops.c @@ -175,7 +175,7 @@ xfs_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); if (last_fsb <= end_fsb) return 0; @@ -226,14 +226,7 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* - * Do not update the on-disk file size. If we update the - * on-disk file size and then the system crashes before the - * contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). 
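Every tracepoint touched in the xfs_trace.h hunks above edits the same three sections of the TRACE_EVENT()/DECLARE_EVENT_CLASS() template: TP_STRUCT__entry declares the record layout, TP_fast_assign fills it at the probe site, and TP_printk renders it at read time, which is why adding or dropping a field such as new_size means touching all three. A minimal sketch of the shape (not an event from this patch; the usual TRACE_INCLUDE_PATH boilerplate is abbreviated):

#undef TRACE_SYSTEM
#define TRACE_SYSTEM sample

#if !defined(_TRACE_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SAMPLE_H

#include <linux/tracepoint.h>

TRACE_EVENT(sample_resize,

        TP_PROTO(unsigned long ino, long long size, long long new_size),

        TP_ARGS(ino, size, new_size),

        TP_STRUCT__entry(
                __field(unsigned long,  ino)
                __field(long long,      size)
                __field(long long,      new_size)
        ),

        TP_fast_assign(
                __entry->ino            = ino;
                __entry->size           = size;
                __entry->new_size       = new_size;
        ),

        TP_printk("ino 0x%lx size 0x%llx new_size 0x%llx",
                  __entry->ino, __entry->size, __entry->new_size)
);

#endif /* _TRACE_SAMPLE_H */
#include <trace/define_trace.h>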
- */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); + error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { /* * If we get an error at this point we simply don't @@ -547,8 +540,8 @@ xfs_release( return 0; if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -625,7 +618,7 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && S_ISREG(ip->i_d.di_mode)); @@ -639,12 +632,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + (ip->i_delayed_blks != 0)))) { error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; @@ -677,18 +670,13 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - - ASSERT(ip->i_d.di_nextents == 0); } else if (S_ISLNK(ip->i_d.di_mode)) { /* @@ -1973,11 +1961,11 @@ xfs_zero_remaining_bytes( * since nothing can read beyond eof. The space will * be zeroed when the file is extended anyway. */ - if (startoff >= XFS_ISIZE(ip)) + if (startoff >= ip->i_size) return 0; - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + if (endoff > ip->i_size) + endoff = ip->i_size; bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -2272,7 +2260,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); + bf->l_start += ip->i_size; break; default: return XFS_ERROR(EINVAL); @@ -2289,7 +2277,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = XFS_ISIZE(ip); + fsize = ip->i_size; /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve diff --git a/trunk/include/linux/audit.h b/trunk/include/linux/audit.h index 9ff7a2c48b50..426ab9f4dd85 100644 --- a/trunk/include/linux/audit.h +++ b/trunk/include/linux/audit.h @@ -26,7 +26,6 @@ #include #include -#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -182,40 +181,6 @@ * AUDIT_UNUSED_BITS is updated if need be. 
*/ #define AUDIT_UNUSED_BITS 0x07FFFC00 -/* AUDIT_FIELD_COMPARE rule list */ -#define AUDIT_COMPARE_UID_TO_OBJ_UID 1 -#define AUDIT_COMPARE_GID_TO_OBJ_GID 2 -#define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 -#define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 -#define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 -#define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 -#define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 -#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 -#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 - -#define AUDIT_COMPARE_UID_TO_AUID 10 -#define AUDIT_COMPARE_UID_TO_EUID 11 -#define AUDIT_COMPARE_UID_TO_FSUID 12 -#define AUDIT_COMPARE_UID_TO_SUID 13 - -#define AUDIT_COMPARE_AUID_TO_FSUID 14 -#define AUDIT_COMPARE_AUID_TO_SUID 15 -#define AUDIT_COMPARE_AUID_TO_EUID 16 - -#define AUDIT_COMPARE_EUID_TO_SUID 17 -#define AUDIT_COMPARE_EUID_TO_FSUID 18 - -#define AUDIT_COMPARE_SUID_TO_FSUID 19 - -#define AUDIT_COMPARE_GID_TO_EGID 20 -#define AUDIT_COMPARE_GID_TO_FSGID 21 -#define AUDIT_COMPARE_GID_TO_SGID 22 - -#define AUDIT_COMPARE_EGID_TO_FSGID 23 -#define AUDIT_COMPARE_EGID_TO_SGID 24 -#define AUDIT_COMPARE_SGID_TO_FSGID 25 - -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID /* Rule fields */ /* These are useful when checking the @@ -257,9 +222,6 @@ #define AUDIT_PERM 106 #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 -#define AUDIT_OBJ_UID 109 -#define AUDIT_OBJ_GID 110 -#define AUDIT_FIELD_COMPARE 111 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) @@ -446,24 +408,28 @@ struct audit_field { void *lsm_rule; }; +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 +#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ +extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); -extern void __audit_free(struct task_struct *task); -extern void __audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); -extern void __audit_syscall_exit(int ret_success, long ret_value); +extern void audit_free(struct task_struct *task); +extern void audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); +extern void audit_syscall_exit(int failed, long return_code); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); -extern void __audit_seccomp(unsigned long syscall); extern void __audit_ptrace(struct task_struct *t); static inline int audit_dummy_context(void) @@ -471,27 +437,6 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } -static inline void audit_free(struct task_struct *task) -{ - if (unlikely(task->audit_context)) - __audit_free(task); -} -static inline void audit_syscall_entry(int arch, int major, unsigned long a0, - unsigned long a1, unsigned long a2, - unsigned long a3) -{ - if (unlikely(!audit_dummy_context())) - __audit_syscall_entry(arch, major, a0, a1, a2, a3); -} -static inline void audit_syscall_exit(void *pt_regs) -{ - if (unlikely(current->audit_context)) { - int success = is_syscall_success(pt_regs); - int 
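The deleted inlines above all follow one header idiom: a cheap static inline wrapper tests a fast-path condition (audit_dummy_context(), annotated unlikely()) and only then calls the out-of-line, double-underscore worker, so the common no-audit case costs a single well-predicted branch and no function call. The bare shape of the pattern (illustrative names, not the audit API):

#include <linux/compiler.h>
#include <linux/types.h>

extern void __do_account(int event);    /* out-of-line slow path, in a .c file */
extern bool accounting_enabled(void);   /* cheap per-task test */

static inline void do_account(int event)
{
        /* Common case: accounting is off and the call is never made. */
        if (unlikely(accounting_enabled()))
                __do_account(event);
}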
return_code = regs_return_value(pt_regs); - - __audit_syscall_exit(success, return_code); - } -} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -508,12 +453,6 @@ static inline void audit_inode_child(const struct dentry *dentry, } void audit_core_dumps(long signr); -static inline void audit_seccomp(unsigned long syscall) -{ - if (unlikely(!audit_dummy_context())) - __audit_seccomp(syscall); -} - static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) @@ -524,16 +463,17 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(uid_t loginuid); +extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int __audit_bprm(struct linux_binprm *bprm); -extern void __audit_socketcall(int nargs, unsigned long *args); -extern int __audit_sockaddr(int len, void *addr); +extern int audit_bprm(struct linux_binprm *bprm); +extern void audit_socketcall(int nargs, unsigned long *args); +extern int audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); +extern int audit_set_macxattr(const char *name); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); @@ -559,23 +499,6 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } -static inline int audit_bprm(struct linux_binprm *bprm) -{ - if (unlikely(!audit_dummy_context())) - return __audit_bprm(bprm); - return 0; -} -static inline void audit_socketcall(int nargs, unsigned long *args) -{ - if (unlikely(!audit_dummy_context())) - __audit_socketcall(nargs, args); -} -static inline int audit_sockaddr(int len, void *addr) -{ - if (unlikely(!audit_dummy_context())) - return __audit_sockaddr(len, addr); - return 0; -} static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) @@ -621,11 +544,12 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; -#else /* CONFIG_AUDITSYSCALL */ +#else +#define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(r) do { ; } while (0) +#define audit_syscall_exit(f,r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) @@ -634,7 +558,6 @@ extern int audit_signals; #define audit_inode(n,d) do { (void)(d); } while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) -#define audit_seccomp(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) @@ -645,6 
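The #else branch beginning above stubs the whole API out for !CONFIG_AUDITSYSCALL builds, using two deliberate macro shapes: do { ; } while (0) yields a void stub that still parses as exactly one statement, and the GCC statement expression ({ 0; }) yields a stub with a usable return value. A tiny runnable demonstration (plain C with GCC extensions):

#include <stdio.h>

#define audit_log_stub(msg)     do { ; } while (0)     /* void, one statement */
#define audit_alloc_stub(t)     ({ 0; })               /* evaluates to 0 */

int main(void)
{
        int flag = 1;

        if (flag)
                audit_log_stub("x");    /* behaves like a real call site */
        else
                printf("never\n");

        printf("alloc -> %d\n", audit_alloc_stub(NULL));
        return 0;
}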
+568,7 @@ extern int audit_signals; #define audit_socketcall(n,a) ((void)0) #define audit_fd_pair(n,a) ((void)0) #define audit_sockaddr(len, addr) ({ 0; }) +#define audit_set_macxattr(n) do { ; } while (0) #define audit_mq_open(o,m,a) ((void)0) #define audit_mq_sendrecv(d,l,p,t) ((void)0) #define audit_mq_notify(d,n) ((void)0) @@ -655,7 +579,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif /* CONFIG_AUDITSYSCALL */ +#endif #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/trunk/include/linux/kref.h b/trunk/include/linux/kref.h index 9c07dcebded7..abc0120b09b7 100644 --- a/trunk/include/linux/kref.h +++ b/trunk/include/linux/kref.h @@ -17,7 +17,6 @@ #include #include -#include struct kref { atomic_t refcount; diff --git a/trunk/include/linux/ptrace.h b/trunk/include/linux/ptrace.h index c2f1f6a5fcb8..a27e56ca41a4 100644 --- a/trunk/include/linux/ptrace.h +++ b/trunk/include/linux/ptrace.h @@ -112,7 +112,6 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ -#include /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -267,15 +266,6 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif -#ifndef is_syscall_success -/* - * On most systems we can tell if a syscall is a success based on if the retval - * is an error value. On some systems like ia64 and powerpc they have different - * indicators of success/failure and must define their own. - */ -#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) -#endif - /* * should define the following things inside #ifdef __KERNEL__. * diff --git a/trunk/include/linux/tty_driver.h b/trunk/include/linux/tty_driver.h index 5cf685086dd3..ecdaeb98b293 100644 --- a/trunk/include/linux/tty_driver.h +++ b/trunk/include/linux/tty_driver.h @@ -312,6 +312,7 @@ struct tty_driver { */ struct tty_struct **ttys; struct ktermios **termios; + struct ktermios **termios_locked; void *driver_state; /* diff --git a/trunk/include/trace/events/btrfs.h b/trunk/include/trace/events/btrfs.h index 84f3001a568d..b31702ac15be 100644 --- a/trunk/include/trace/events/btrfs.h +++ b/trunk/include/trace/events/btrfs.h @@ -16,8 +16,6 @@ struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; -struct btrfs_block_group_cache; -struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; @@ -46,17 +44,6 @@ struct extent_buffer; obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? 
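The is_syscall_success() macro removed from linux/ptrace.h above relies on the convention that a syscall's register return value encodes failure as a small negative errno, with IS_ERR_VALUE() treating the top 4095 values of an unsigned long as error codes. A runnable restatement of that convention (userspace C; MAX_ERRNO mirrors include/linux/err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static int is_syscall_success(long retval)
{
        return !IS_ERR_VALUE((unsigned long)retval);
}

int main(void)
{
        printf("ret=3       -> %s\n", is_syscall_success(3) ? "success" : "failure");
        printf("ret=-ENOENT -> %s\n", is_syscall_success(-ENOENT) ? "success" : "failure");
        /* Only the top 4095 values count as errors: */
        printf("ret=-4096   -> %s\n", is_syscall_success(-4096) ? "success" : "failure");
        return 0;
}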
__show_root_type(obj) : "-" -#define BTRFS_GROUP_FLAGS \ - { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ - { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ - { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ - { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ - { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ - { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ - { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} - -#define BTRFS_UUID_SIZE 16 - TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(struct btrfs_root *root), @@ -634,34 +621,6 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level) ); -TRACE_EVENT(btrfs_space_reservation, - - TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, - u64 bytes, int reserve), - - TP_ARGS(fs_info, type, val, bytes, reserve), - - TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) - __string( type, type ) - __field( u64, val ) - __field( u64, bytes ) - __field( int, reserve ) - ), - - TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); - __assign_str(type, type); - __entry->val = val; - __entry->bytes = bytes; - __entry->reserve = reserve; - ), - - TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), - __entry->val, __entry->reserve ? "reserve" : "release", - __entry->bytes) -); - DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_PROTO(struct btrfs_root *root, u64 start, u64 len), @@ -700,168 +659,6 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TP_ARGS(root, start, len) ); -TRACE_EVENT(find_free_extent, - - TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, - u64 data), - - TP_ARGS(root, num_bytes, empty_size, data), - - TP_STRUCT__entry( - __field( u64, root_objectid ) - __field( u64, num_bytes ) - __field( u64, empty_size ) - __field( u64, data ) - ), - - TP_fast_assign( - __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; - ), - - TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " - "flags = %Lu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", - BTRFS_GROUP_FLAGS)) -); - -DECLARE_EVENT_CLASS(btrfs__reserve_extent, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len), - - TP_STRUCT__entry( - __field( u64, root_objectid ) - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, len ) - ), - - TP_fast_assign( - __entry->root_objectid = root->root_key.objectid; - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; - ), - - TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " - "start = %Lu, len = %Lu", - show_root_type(__entry->root_objectid), __entry->bg_objectid, - __entry->flags, __print_flags((unsigned long)__entry->flags, - "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) -); - -DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len) -); - -DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - - TP_PROTO(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, u64 start, - u64 len), - - TP_ARGS(root, block_group, start, len) -); - -TRACE_EVENT(btrfs_find_cluster, - - TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, - 
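The deleted BTRFS_GROUP_FLAGS table pairs each block-group bit with a printable name so __print_flags() can render a mask such as DATA|RAID1 when the trace buffer is read. The same table-driven formatting in runnable userspace C (a sketch; the flag values here are only for the example):

#include <stdio.h>

struct flag_name {
        unsigned long   bit;
        const char      *name;
};

static const struct flag_name group_flags[] = {
        { 1UL << 0, "DATA" },
        { 1UL << 1, "SYSTEM" },
        { 1UL << 2, "METADATA" },
        { 1UL << 3, "RAID0" },
        { 1UL << 4, "RAID1" },
};

/* Print a mask as NAME|NAME|..., the way __print_flags() renders it. */
static void print_flags(unsigned long mask)
{
        const char *sep = "";
        size_t i;

        for (i = 0; i < sizeof(group_flags) / sizeof(group_flags[0]); i++) {
                if (mask & group_flags[i].bit) {
                        printf("%s%s", sep, group_flags[i].name);
                        sep = "|";
                }
        }
        printf("\n");
}

int main(void)
{
        print_flags((1UL << 0) | (1UL << 4));   /* prints DATA|RAID1 */
        return 0;
}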
u64 bytes, u64 empty_size, u64 min_bytes), - - TP_ARGS(block_group, start, bytes, empty_size, min_bytes), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, bytes ) - __field( u64, empty_size ) - __field( u64, min_bytes ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = start; - __entry->bytes = bytes; - __entry->empty_size = empty_size; - __entry->min_bytes = min_bytes; - ), - - TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," - " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, - __entry->flags, - __print_flags((unsigned long)__entry->flags, "|", - BTRFS_GROUP_FLAGS), __entry->start, - __entry->bytes, __entry->empty_size, __entry->min_bytes) -); - -TRACE_EVENT(btrfs_failed_cluster_setup, - - TP_PROTO(struct btrfs_block_group_cache *block_group), - - TP_ARGS(block_group), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - ), - - TP_printk("block_group = %Lu", __entry->bg_objectid) -); - -TRACE_EVENT(btrfs_setup_cluster, - - TP_PROTO(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, u64 size, int bitmap), - - TP_ARGS(block_group, cluster, size, bitmap), - - TP_STRUCT__entry( - __field( u64, bg_objectid ) - __field( u64, flags ) - __field( u64, start ) - __field( u64, max_size ) - __field( u64, size ) - __field( int, bitmap ) - ), - - TP_fast_assign( - __entry->bg_objectid = block_group->key.objectid; - __entry->flags = block_group->flags; - __entry->start = cluster->window_start; - __entry->max_size = cluster->max_size; - __entry->size = size; - __entry->bitmap = bitmap; - ), - - TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " - "size = %Lu, max_size = %Lu, bitmap = %d", - __entry->bg_objectid, - __entry->flags, - __print_flags((unsigned long)__entry->flags, "|", - BTRFS_GROUP_FLAGS), __entry->start, - __entry->size, __entry->max_size, __entry->bitmap) -); - #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 3f42cd66f0f8..6ac2236244c3 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -355,7 +355,7 @@ config AUDIT config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM) + depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that @@ -372,20 +372,6 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. 
- source "kernel/irq/Kconfig" menu "RCU Subsystem" diff --git a/trunk/kernel/audit.c b/trunk/kernel/audit.c index bb0eb5bb9a0a..57e3f5107937 100644 --- a/trunk/kernel/audit.c +++ b/trunk/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, "%s", prefix); + audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/trunk/kernel/audit.h b/trunk/kernel/audit.h index 816766803371..91e7071c4d2c 100644 --- a/trunk/kernel/audit.h +++ b/trunk/kernel/audit.h @@ -36,8 +36,12 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and fill it in at syscall + * and always fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it diff --git a/trunk/kernel/auditfilter.c b/trunk/kernel/auditfilter.c index a6c3f1abd206..f8277c80d678 100644 --- a/trunk/kernel/auditfilter.c +++ b/trunk/kernel/auditfilter.c @@ -235,15 +235,13 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -387,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) + if ((f->val & ~S_IFMT) > S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -461,8 +459,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -526,6 +522,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: + err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -539,11 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) + if ((f->val & ~S_IFMT) > S_IFMT) goto exit_free; break; default: diff --git a/trunk/kernel/auditsc.c b/trunk/kernel/auditsc.c index caaea6e944f8..e7fe2b0d29b3 100644 --- a/trunk/kernel/auditsc.c +++ b/trunk/kernel/auditsc.c @@ -70,15 +70,9 @@ #include "audit.h" -/* flags stating the success for a syscall */ -#define AUDITSC_INVALID 0 -#define 
AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 - /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. */ -#define AUDIT_NAMES 5 + * for saving names from getname(). */ +#define AUDIT_NAMES 20 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -107,8 +101,9 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { - struct list_head list; /* audit_context->names_list */ const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -118,14 +113,6 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; }; struct audit_aux_data { @@ -187,17 +174,8 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. - */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ + int name_count; + struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -327,21 +305,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int val) +static int audit_match_filetype(struct audit_context *ctx, int which) { - struct audit_names *n; - umode_t mode = (umode_t)val; + unsigned index = which & ~S_IFMT; + umode_t mode = which & S_IFMT; if (unlikely(!ctx)) return 0; - list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && - ((n->mode & S_IFMT) == mode)) - return 1; - } - - return 0; + if (index >= ctx->name_count) + return 0; + if (ctx->names[index].ino == -1) + return 0; + if ((ctx->names[index].mode ^ mode) & S_IFMT) + return 0; + return 1; } /* @@ -463,134 +441,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) -{ - struct audit_names *n; - unsigned long addr; - uid_t uid2; - int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - - if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - - if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; - - rc = 
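The audit_match_filetype() variant added above packs two things into the single rule value: the bits outside S_IFMT select which names[] slot to test, and the S_IFMT bits carry the expected file type, whereas the list-walking version it replaces matched against any collected name. Unpacking such a value looks like this (runnable C):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        unsigned which = 2 | S_IFREG;           /* slot 2, expect a regular file */
        unsigned index = which & ~S_IFMT;       /* -> 2 */
        unsigned mode  = which & S_IFMT;        /* -> S_IFREG */

        printf("index=%u matches_reg=%d\n", index, mode == S_IFREG);
        return 0;
}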
audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - } - return 0; -} - -static int audit_field_compare(struct task_struct *tsk, - const struct cred *cred, - struct audit_field *f, - struct audit_context *ctx, - struct audit_names *name) -{ - switch (f->val) { - /* process to file object comparisons */ - case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); - /* uid comparisons */ - case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); - case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); - case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); - case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); - /* auid comparisons */ - case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); - case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); - case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); - /* euid comparisons */ - case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); - case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); - /* suid comparisons */ - case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); - /* gid comparisons */ - case AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); - case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); - case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); - /* egid comparisons */ - case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); - case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); - /* sgid comparison */ - case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); - default: - WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); - return 0; - } - return 0; -} - /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
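audit_compare_id() above avoids one helper per field by taking offsetof(struct audit_names, uid) or offsetof(struct audit_names, gid) and reading the id through pointer arithmetic; the BUILD_BUG_ON pins sizeof(uid_t) == sizeof(gid_t) so either offset yields a correctly sized read. The core trick in runnable C (the struct is illustrative):

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

struct rec {
        uid_t uid;
        gid_t gid;
};

/*
 * Read the id that lives 'off' bytes into *r. Valid only while
 * uid_t and gid_t have the same size, as the kernel code asserts.
 */
static uid_t id_at(const struct rec *r, size_t off)
{
        return *(const uid_t *)((const char *)r + off);
}

int main(void)
{
        struct rec r = { .uid = 1000, .gid = 100 };

        printf("uid=%u gid=%u\n",
               (unsigned)id_at(&r, offsetof(struct rec, uid)),
               (unsigned)id_at(&r, offsetof(struct rec, gid)));
        return 0;
}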
@@ -607,14 +457,13 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, need_sid = 1; + int i, j, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; - struct audit_names *n; int result = 0; switch (f->type) { @@ -673,14 +522,12 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) { - if (audit_comparator(MAJOR(name->dev), f->op, f->val) || - audit_comparator(MAJOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val) || - audit_comparator(MAJOR(n->rdev), f->op, f->val)) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -688,14 +535,12 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) { - if (audit_comparator(MINOR(name->dev), f->op, f->val) || - audit_comparator(MINOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val) || - audit_comparator(MINOR(n->rdev), f->op, f->val)) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -706,32 +551,8 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->ino, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_UID: - if (name) { - result = audit_comparator(name->uid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_GID: - if (name) { - result = audit_comparator(name->gid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } @@ -786,10 +607,11 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + for (j = 0; j < ctx->name_count; j++) { + if (security_audit_rule_match( + ctx->names[j].osid, + f->type, f->op, + f->lsm_rule, ctx)) { ++result; break; } @@ -821,10 +643,8 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; - case AUDIT_FIELD_COMPARE: - result = audit_field_compare(tsk, cred, f, ctx, name); - break; } + if (!result) return 0; } @@ -902,53 +722,40 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* - * Given an audit_name check the inode hash table to see if they match. 
- * Called holding the rcu read lock to protect the use of audit_inode_hash - */ -static int audit_filter_inode_name(struct task_struct *tsk, - struct audit_names *n, - struct audit_context *ctx) { - int word, bit; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - - word = AUDIT_WORD(ctx->major); - bit = AUDIT_BIT(ctx->major); - - if (list_empty(list)) - return 0; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - - return 0; -} - -/* At syscall exit time, this filter is called if any audit_names have been +/* At syscall exit time, this filter is called if any audit_names[] have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names. + * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - struct audit_names *n; + int i; + struct audit_entry *e; + enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_filter_inode_name(tsk, n, ctx)) - break; + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return; + } + } } rcu_read_unlock(); } @@ -959,7 +766,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (!context) + if (likely(!context)) return NULL; context->return_valid = return_valid; @@ -992,7 +799,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - struct audit_names *n, *next; + int i; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -1003,9 +810,10 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - list_for_each_entry(n, &context->names_list, list) { + for (i = 0; i < context->name_count; i++) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + context->names[i].name, + context->names[i].name ?: "(null)"); } dump_stack(); return; @@ -1016,12 +824,9 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - list_for_each_entry_safe(n, next, &context->names_list, list) { - list_del(&n->list); - if (n->name && n->name_put) - __putname(n->name); - if (n->should_free) - kfree(n); + for (i = 0; i < context->name_count; i++) { + if (context->names[i].name && context->names[i].name_put) + __putname(context->names[i].name); } context->name_count = 0; path_put(&context->pwd); @@ -1059,7 +864,6 @@ static inline struct audit_context *audit_alloc_context(enum 
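audit_filter_inodes() above walks the per-bucket rule lists under rcu_read_lock() with list_for_each_entry_rcu(), letting rule updates proceed concurrently while each reader sees a consistent list. The reader-side shape as a kernel-style sketch (illustrative types; a real writer must pair this with list_add_rcu()/list_del_rcu() and deferred freeing):

#include <linux/rcupdate.h>
#include <linux/list.h>

struct rule {
        struct list_head list;
        int              id;
};

/* Return the id of the first matching rule in the bucket, or -1. */
static int first_match(struct list_head *bucket, int key)
{
        struct rule *r;
        int ret = -1;

        rcu_read_lock();
        list_for_each_entry_rcu(r, bucket, list) {
                if (r->id == key) {
                        ret = r->id;
                        break;
                }
        }
        rcu_read_unlock();
        return ret;
}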
audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); - INIT_LIST_HEAD(&context->names_list); return context; } @@ -1082,7 +886,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) + if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { @@ -1171,7 +975,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, " exe=", + audit_log_d_path(ab, "exe=", &vma->vm_file->f_path); break; } @@ -1362,8 +1166,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i, len; - size_t len_sent = 0; + int i; + size_t len, len_sent = 0; const char __user *p; char *buf; @@ -1520,68 +1324,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1589,7 +1331,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; - struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1725,14 +1466,70 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } + for (i = 0; i < context->name_count; i++) { + struct audit_names *n = &context->names[i]; - i = 0; - list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + continue; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", i); + + if 
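audit_log_name(), removed in the hunk above, shows the standard sequence for emitting one audit record: audit_log_start() opens a buffer for a given record type, audit_log_format() and audit_log_untrustedstring() append fields (the latter escaping user-controlled strings), and audit_log_end() queues the record. A skeleton of that sequence (a sketch; the fields logged are illustrative):

#include <linux/audit.h>
#include <linux/gfp.h>

static void log_one_item(struct audit_context *context, int item,
                         const char *name)
{
        struct audit_buffer *ab;

        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
        if (!ab)
                return;         /* audit_panic() has already been called */

        audit_log_format(ab, "item=%d", item);
        if (name) {
                audit_log_format(ab, " name=");
                /* escape the untrusted, user-controlled string */
                audit_log_untrustedstring(ab, name);
        } else {
                audit_log_format(ab, " name=(null)");
        }
        audit_log_end(ab);
}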
(n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1748,12 +1545,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void __audit_free(struct task_struct *tsk) +void audit_free(struct task_struct *tsk) { struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (!context) + if (likely(!context)) return; /* Check for system calls that do not go through the exit @@ -1786,7 +1583,7 @@ void __audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void __audit_syscall_entry(int arch, int major, +void audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -1794,7 +1591,7 @@ void __audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (!context) + if (unlikely(!context)) return; /* @@ -1851,7 +1648,7 @@ void __audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (state == AUDIT_DISABLED) + if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; @@ -1861,9 +1658,30 @@ void __audit_syscall_entry(int arch, int major, context->ppid = 0; } +void audit_finish_fork(struct task_struct *child) +{ + struct audit_context *ctx = current->audit_context; + struct audit_context *p = child->audit_context; + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) + return; + p->arch = ctx->arch; + p->major = ctx->major; + memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); + p->ctime = ctx->ctime; + p->dummy = ctx->dummy; + p->in_syscall = ctx->in_syscall; + p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); + p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; +} + /** * audit_syscall_exit - deallocate audit context after a system call - * @pt_regs: syscall registers + * @valid: success/failure flag + * @return_code: syscall return value * * Tear down after system call. 
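Both the audit_filter_inodes() loop earlier in this patch and audit_filter_syscall() gate rules on the same per-syscall bitmask test: a rule can only match when the bit for the current syscall number is set in its mask words. Below is a minimal standalone sketch of that test (userspace C, not kernel code); the macros mirror include/linux/audit.h, while struct fake_rule is a hypothetical stand-in for the kernel's struct audit_krule.

/* Sketch of the AUDIT_WORD/AUDIT_BIT rule-mask test used by the filters
 * above. Assumptions: the macros are copied from include/linux/audit.h;
 * fake_rule is a simplified stand-in for struct audit_krule. */
#include <stdio.h>
#include <stdint.h>

#define AUDIT_BITMASK_SIZE 64
#define AUDIT_WORD(nr) ((uint32_t)((nr) / 32))
#define AUDIT_BIT(nr)  (1U << ((nr) - AUDIT_WORD(nr) * 32))

struct fake_rule {
	uint32_t mask[AUDIT_BITMASK_SIZE];	/* one bit per syscall number */
};

static int rule_matches_syscall(const struct fake_rule *r, int major)
{
	int word = AUDIT_WORD(major);
	uint32_t bit = AUDIT_BIT(major);

	/* same test as (e->rule.mask[word] & bit) == bit in the hunks above */
	return (r->mask[word] & bit) == bit;
}

int main(void)
{
	struct fake_rule r = { { 0 } };
	int nr = 59;				/* e.g. execve on x86-64 */

	r.mask[AUDIT_WORD(nr)] |= AUDIT_BIT(nr);
	printf("syscall %d matches: %d\n", nr, rule_matches_syscall(&r, nr));
	printf("syscall %d matches: %d\n", nr + 1, rule_matches_syscall(&r, nr + 1));
	return 0;
}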
If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1871,18 +1689,14 @@ void __audit_syscall_entry(int arch, int major, * message), then write out the syscall information. In all cases, * free the names stored from getname(). */ -void __audit_syscall_exit(int success, long return_code) +void audit_syscall_exit(int valid, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - if (success) - success = AUDITSC_SUCCESS; - else - success = AUDITSC_FAILURE; + context = audit_get_context(tsk, valid, return_code); - context = audit_get_context(tsk, success, return_code); - if (!context) + if (likely(!context)) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) @@ -2007,30 +1821,6 @@ static void handle_path(const struct dentry *dentry) #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) -{ - struct audit_names *aname; - - if (context->name_count < AUDIT_NAMES) { - aname = &context->preallocated_names[context->name_count]; - memset(aname, 0, sizeof(*aname)); - } else { - aname = kzalloc(sizeof(*aname), GFP_NOFS); - if (!aname) - return NULL; - aname->should_free = true; - } - - aname->ino = (unsigned long)-1; - list_add_tail(&aname->list, &context->names_list); - - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return aname; -} - /** * audit_getname - add a name to the list * @name: name to add @@ -2041,7 +1831,9 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - struct audit_names *n; + + if (IS_ERR(name) || !name) + return; if (!context->in_syscall) { #if AUDIT_DEBUG == 2 @@ -2051,15 +1843,13 @@ void __audit_getname(const char *name) #endif return; } - - n = audit_alloc_name(context); - if (!n) - return; - - n->name = name; - n->name_len = AUDIT_NAME_FULL; - n->name_put = true; - + BUG_ON(context->name_count >= AUDIT_NAMES); + context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; + context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].osid = 0; + ++context->name_count; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -2081,13 +1871,12 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { - struct audit_names *n; int i; - - list_for_each_entry(n, &context->names_list, list) + for (i = 0; i < context->name_count; i++) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); - } + context->names[i].name, + context->names[i].name ?: "(null)"); + } #endif __putname(name); } @@ -2108,11 +1897,39 @@ void audit_putname(const char *name) #endif } +static int audit_inc_name_count(struct audit_context *context, + const struct inode *inode) +{ + if (context->name_count >= AUDIT_NAMES) { + if (inode) + printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " + "dev=%02x:%02x, inode=%lu\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); + + else + printk(KERN_DEBUG "name_count maxed, losing inode data\n"); + return 1; + } + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return 0; +} + + static inline int audit_copy_fcaps(struct audit_names *name, 
const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; + memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); + memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); + name->fcap.fE = 0; + name->fcap_ver = 0; + if (!dentry) return 0; @@ -2152,25 +1969,30 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { + int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; - struct audit_names *n; if (!context->in_syscall) return; - - list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; + if (context->name_count + && context->names[context->name_count-1].name + && context->names[context->name_count-1].name == name) + idx = context->name_count - 1; + else if (context->name_count > 1 + && context->names[context->name_count-2].name + && context->names[context->name_count-2].name == name) + idx = context->name_count - 2; + else { + /* FIXME: how much do we care about inodes that have no + * associated name? */ + if (audit_inc_name_count(context, inode)) + return; + idx = context->name_count - 1; + context->names[idx].name = NULL; } - - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); - if (!n) - return; -out: handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(&context->names[idx], dentry, inode); } /** @@ -2189,11 +2011,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { + int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2203,7 +2025,9 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - list_for_each_entry(n, &context->names_list, list) { + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + if (!n->name) continue; @@ -2216,7 +2040,9 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - list_for_each_entry(n, &context->names_list, list) { + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + if (!n->name) continue; @@ -2234,29 +2060,34 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - n = audit_alloc_name(context); - if (!n) + if (audit_inc_name_count(context, parent)) return; - audit_copy_inode(n, NULL, parent); + idx = context->name_count - 1; + context->names[idx].name = NULL; + audit_copy_inode(&context->names[idx], NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); - if (!n) + if (audit_inc_name_count(context, inode)) return; + idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. 
All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + context->names[idx].name = found_parent; + context->names[idx].name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + context->names[idx].name_put = 0; + } else { + context->names[idx].name = NULL; } if (inode) - audit_copy_inode(n, NULL, inode); + audit_copy_inode(&context->names[idx], NULL, inode); + else + context->names[idx].ino = (unsigned long)-1; } } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2290,28 +2121,19 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set current task's audit_context loginuid + * audit_set_loginuid - set a task's audit_context loginuid + * @task: task whose audit context is being modified * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - struct task_struct *task = current; + unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; - unsigned int sessionid; - -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; @@ -2449,11 +2271,14 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int __audit_bprm(struct linux_binprm *bprm) +int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2474,10 +2299,13 @@ int __audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void __audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; + if (likely(!context || context->dummy)) + return; + context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2503,10 +2331,13 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. 
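With this revert, the collection hooks above (audit_bprm(), audit_socketcall()) and audit_sockaddr() just below again carry their own fast path: if the task has no audit context, or the context is marked dummy because no filter rule selected it, the hook returns before allocating or copying anything. A toy standalone illustration of that guard follows, using hypothetical stand-ins (struct ctx, record_socketcall()) rather than the real kernel types.

/* Toy model of the !context / context->dummy fast path shared by the
 * hooks above. struct ctx and record_socketcall() are hypothetical
 * stand-ins, not kernel interfaces. */
#include <stdio.h>
#include <string.h>

struct ctx {
	int dummy;			/* set when no filter rule selected us */
	int nargs;
	unsigned long args[6];
};

static void record_socketcall(struct ctx *context, int nargs,
			      const unsigned long *args)
{
	if (!context || context->dummy)	/* cf. audit_socketcall() above */
		return;			/* nothing is recorded */
	context->nargs = nargs;
	memcpy(context->args, args, nargs * sizeof(unsigned long));
}

int main(void)
{
	struct ctx c = { .dummy = 0 };
	unsigned long a[2] = { 1, 42 };

	record_socketcall(&c, 2, a);
	printf("recorded %d args, first=%lu\n", c.nargs, c.args[0]);

	record_socketcall(NULL, 2, a);	/* no context: silently skipped */
	return 0;
}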
*/ -int __audit_sockaddr(int len, void *a) +int audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; + if (likely(!context || context->dummy)) + return 0; + if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) @@ -2668,25 +2499,6 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) -{ - uid_t auid, uid; - gid_t gid; - unsigned int sessionid; - - auid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - current_uid_gid(&uid, &gid); - - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " reason="); - audit_log_string(ab, reason); - audit_log_format(ab, " sig=%ld", signr); -} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2697,6 +2509,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) void audit_core_dumps(long signr) { struct audit_buffer *ab; + u32 sid; + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; + unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2705,17 +2521,24 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "memory violation", signr); - audit_log_end(ab); -} - -void __audit_seccomp(unsigned long syscall) -{ - struct audit_buffer *ab; + current_uid_gid(&uid, &gid); + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + security_task_getsecid(current, &sid); + if (sid) { + char *ctx = NULL; + u32 len; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", SIGKILL); - audit_log_format(ab, " syscall=%ld", syscall); + if (security_secid_to_secctx(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else { + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + } + } + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } diff --git a/trunk/kernel/capability.c b/trunk/kernel/capability.c index 3f1adb6c6470..0fcf1c14a297 100644 --- a/trunk/kernel/capability.c +++ b/trunk/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(current_cred(), ns, cap) == 0) { + if (has_ns_capability(current, ns, cap)) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 294b1709170d..c44738267be7 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -964,7 +964,8 @@ void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - audit_free(tsk); + if (unlikely(tsk->audit_context)) + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 051f090d40c1..f3fa18887cc9 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -1527,6 +1527,8 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } + audit_finish_fork(p); + /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that diff 
--git a/trunk/kernel/seccomp.c b/trunk/kernel/seccomp.c index e8d76c5895ea..57d4b13b631d 100644 --- a/trunk/kernel/seccomp.c +++ b/trunk/kernel/seccomp.c @@ -6,7 +6,6 @@ * This defines a simple but solid secure-computing mode. */ -#include <linux/audit.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/compat.h> @@ -55,7 +54,6 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall); do_exit(SIGKILL); } diff --git a/trunk/security/integrity/ima/ima_audit.c b/trunk/security/integrity/ima/ima_audit.c index 2ad942fb1e23..c5c5a72c30be 100644 --- a/trunk/security/integrity/ima/ima_audit.c +++ b/trunk/security/integrity/ima/ima_audit.c @@ -56,11 +56,9 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, audit_log_format(ab, " name="); audit_log_untrustedstring(ab, fname); } - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, inode->i_ino); audit_log_format(ab, " res=%d", !result ? 0 : 1); audit_log_end(ab); } diff --git a/trunk/security/lsm_audit.c b/trunk/security/lsm_audit.c index 293b8c45b1d1..7bd6f138236b 100644 --- a/trunk/security/lsm_audit.c +++ b/trunk/security/lsm_audit.c @@ -232,14 +232,13 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_PATH: { struct inode *inode; - audit_log_d_path(ab, " path=", &a->u.path); + audit_log_d_path(ab, "path=", &a->u.path); inode = a->u.path.dentry->d_inode; - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -249,11 +248,10 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, a->u.dentry->d_name.name); inode = a->u.dentry->d_inode; - if (inode) { - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); - } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_INODE: { @@ -268,9 +266,8 @@ static void dump_common_audit_data(struct audit_buffer *ab, dentry->d_name.name); dput(dentry); } - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id, + inode->i_ino); break; } case LSM_AUDIT_DATA_TASK: @@ -318,7 +315,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, .dentry = u->dentry, .mnt = u->mnt }; - audit_log_d_path(ab, " path=", &path); + audit_log_d_path(ab, "path=", &path); break; } if (!u->addr) diff --git a/trunk/sound/core/Kconfig b/trunk/sound/core/Kconfig index b413ed05e74d..ad409381f8cc 100644 --- a/trunk/sound/core/Kconfig +++ b/trunk/sound/core/Kconfig @@ -12,9 +12,6 @@ config SND_HWDEP config SND_RAWMIDI tristate -config SND_COMPRESS_OFFLOAD - tristate - # To be effective this also requires INPUT - users should say: # select SND_JACK if INPUT=y || INPUT=SND # to avoid having to force INPUT on. @@ -157,6 +154,16 @@ config SND_DYNAMIC_MINORS If you are unsure about this, say N here. 
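The ima_audit.c and lsm_audit.c hunks above collapse the dev=/ino= output back into a single audit_log_format() call. The superblock id (s_id) is a kernel-controlled string, so plain %s formatting is sufficient there, while user-influenced strings such as file names keep going through audit_log_untrustedstring(), which escapes unprintable data. A crude userspace model of that distinction follows; log_untrusted() is a hypothetical stand-in and much simpler than the real helper.

/* Crude model of trusted vs. untrusted audit fields. log_untrusted() is a
 * hypothetical stand-in; the real audit_log_untrustedstring() hex-encodes
 * the whole string when it contains unprintable or quote characters. */
#include <stdio.h>
#include <ctype.h>

static void log_untrusted(const char *s)
{
	for (; *s; s++) {
		if (isprint((unsigned char)*s) && *s != '"')
			putchar(*s);
		else
			printf("\\x%02X", (unsigned char)*s);	/* escape */
	}
	putchar('\n');
}

int main(void)
{
	/* kernel-controlled: safe to format directly, as in the hunks above */
	printf(" dev=%s ino=%lu\n", "sda1", 1234UL);

	/* user-controlled: must not be trusted to be printable */
	log_untrusted("evil\nname");
	return 0;
}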
+config SND_COMPRESS_OFFLOAD + tristate "ALSA Compressed audio offload support" + default n + help + If you want support for offloading compressed audio and have such + hardware, then you should say Y here and also to the DSP driver + of your platform. + + If you are unsure about this, say N here. + config SND_SUPPORT_OLD_API bool "Support old ALSA API" default y diff --git a/trunk/sound/pci/au88x0/au88x0.c b/trunk/sound/pci/au88x0/au88x0.c index f13ad536b2d5..762bb108c51c 100644 --- a/trunk/sound/pci/au88x0/au88x0.c +++ b/trunk/sound/pci/au88x0/au88x0.c @@ -268,14 +268,8 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) card->shortname, chip->io, chip->irq); // (4) Alloc components. - err = snd_vortex_mixer(chip); - if (err < 0) { - snd_card_free(card); - return err; - } // ADB pcm. - err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM); - if (err < 0) { + if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) { snd_card_free(card); return err; } @@ -305,6 +299,11 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) return err; } #endif + // snd_ac97_mixer and Vortex mixer. + if ((err = snd_vortex_mixer(chip)) < 0) { + snd_card_free(card); + return err; + } if ((err = snd_vortex_midi(chip)) < 0) { snd_card_free(card); return err; diff --git a/trunk/sound/pci/au88x0/au88x0.h b/trunk/sound/pci/au88x0/au88x0.h index bb938153a964..02f6e08f7592 100644 --- a/trunk/sound/pci/au88x0/au88x0.h +++ b/trunk/sound/pci/au88x0/au88x0.h @@ -105,7 +105,6 @@ #define MIX_SPDIF(x) (vortex->mixspdif[x]) #define NR_WTPB 0x20 /* WT channels per each bank. */ -#define NR_PCM 0x10 /* Structs */ typedef struct { diff --git a/trunk/sound/pci/au88x0/au88x0_pcm.c b/trunk/sound/pci/au88x0/au88x0_pcm.c index 0ef2f9712208..0488633ea874 100644 --- a/trunk/sound/pci/au88x0/au88x0_pcm.c +++ b/trunk/sound/pci/au88x0/au88x0_pcm.c @@ -168,7 +168,6 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream) runtime->hw = snd_vortex_playback_hw_adb; #ifdef CHIP_AU8830 if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && - VORTEX_IS_QUAD(vortex) && VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) { runtime->hw.channels_max = 4; snd_pcm_hw_constraint_list(runtime, 0, diff --git a/trunk/sound/pci/hda/hda_intel.c b/trunk/sound/pci/hda/hda_intel.c index fb35474c1203..0852e204a4c8 100644 --- a/trunk/sound/pci/hda/hda_intel.c +++ b/trunk/sound/pci/hda/hda_intel.c @@ -2498,7 +2498,6 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = { SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB), SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB), - SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB), SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB), SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB), SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB), diff --git a/trunk/sound/pci/hda/patch_sigmatel.c b/trunk/sound/pci/hda/patch_sigmatel.c index 3556408d6ece..87e684fa830f 100644 --- a/trunk/sound/pci/hda/patch_sigmatel.c +++ b/trunk/sound/pci/hda/patch_sigmatel.c @@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = { SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd, "Dell Studio 1557", STAC_DELL_M6_DMIC), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe, - "Dell Studio XPS 1645", STAC_DELL_M6_DMIC), + "Dell Studio XPS 1645", STAC_DELL_M6_BOTH), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413, "Dell Studio 1558", STAC_DELL_M6_DMIC), {} /* 
terminator */ diff --git a/trunk/sound/pci/oxygen/xonar_wm87x6.c b/trunk/sound/pci/oxygen/xonar_wm87x6.c index 63cff90706bf..478303e6c2b0 100644 --- a/trunk/sound/pci/oxygen/xonar_wm87x6.c +++ b/trunk/sound/pci/oxygen/xonar_wm87x6.c @@ -177,7 +177,6 @@ static void wm8776_registers_init(struct oxygen *chip) struct xonar_wm87x6 *data = chip->model_data; wm8776_write(chip, WM8776_RESET, 0); - wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK); wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN | WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT); wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0);
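One detail worth pulling out of the auditsc.c hunks above: audit_set_loginuid() pairs every loginuid update with a session id taken from a global counter, so two logins under the same uid stay distinguishable in the resulting records. A compilable sketch of that pairing follows, assuming C11 atomics in place of the kernel's atomic_t and a hypothetical struct task in place of task_struct; the permission check is assumed to live with the /proc writer, as the fs/proc/base.c entry in the diffstat suggests.

/* Sketch of loginuid/sessionid pairing, not kernel code. Assumptions:
 * C11 <stdatomic.h> replaces the kernel's atomic_t/atomic_inc_return(),
 * and struct task is a hypothetical stand-in for task_struct fields. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint session_id;		/* cf. static atomic_t session_id */

struct task {
	unsigned int loginuid;
	unsigned int sessionid;
};

static int set_loginuid(struct task *t, unsigned int loginuid)
{
	/* atomic_fetch_add() returns the old value; +1 mimics the kernel's
	 * atomic_inc_return() so ids start at 1 */
	t->sessionid = atomic_fetch_add(&session_id, 1) + 1;
	t->loginuid = loginuid;
	return 0;
}

int main(void)
{
	struct task a = { 0 }, b = { 0 };

	set_loginuid(&a, 1000);
	set_loginuid(&b, 1000);		/* same uid, fresh session */
	printf("a: auid=%u ses=%u\n", a.loginuid, a.sessionid);
	printf("b: auid=%u ses=%u\n", b.loginuid, b.sessionid);
	return 0;
}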