diff --git a/[refs] b/[refs]
index b745838783e8..722e4d82aa4d 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 7a1fb5d06d3936c0982e2cf8b53b046244a9aad6
+refs/heads/master: 8d7773a32d8aa723030712b0a500a4a402a21c85
diff --git a/trunk/Documentation/ABI/testing/sysfs-fs-ext4 b/trunk/Documentation/ABI/testing/sysfs-fs-ext4
deleted file mode 100644
index 4e79074de282..000000000000
--- a/trunk/Documentation/ABI/testing/sysfs-fs-ext4
+++ /dev/null
@@ -1,81 +0,0 @@
-What:		/sys/fs/ext4/<disk>/mb_stats
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Controls whether the multiblock allocator should
-		collect statistics, which are shown during the unmount.
-		1 means to collect statistics, 0 means not to collect
-		statistics
-
-What:		/sys/fs/ext4/<disk>/mb_group_prealloc
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The multiblock allocator will round up allocation
-		requests to a multiple of this tuning parameter if the
-		stripe size is not set in the ext4 superblock
-
-What:		/sys/fs/ext4/<disk>/mb_max_to_scan
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The maximum number of extents the multiblock allocator
-		will search to find the best extent
-
-What:		/sys/fs/ext4/<disk>/mb_min_to_scan
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The minimum number of extents the multiblock allocator
-		will search to find the best extent
-
-What:		/sys/fs/ext4/<disk>/mb_order2_req
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Tuning parameter which controls the minimum size for
-		requests (as a power of 2) where the buddy cache is
-		used
-
-What:		/sys/fs/ext4/<disk>/mb_stream_req
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Files which have fewer blocks than this tunable
-		parameter will have their blocks allocated out of a
-		block group specific preallocation pool, so that small
-		files are packed closely together.  Each large file
-		will have its blocks allocated out of its own unique
-		preallocation pool.
-
-What:		/sys/fs/ext4/<disk>/inode_readahead
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Tuning parameter which controls the maximum number of
-		inode table blocks that ext4's inode table readahead
-		algorithm will pre-read into the buffer cache
-
-What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of blocks
-		that are dirty in the page cache, but which do not
-		have their location in the filesystem allocated yet.
-
-What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of kilobytes
-		of data that have been written to this filesystem since it was
-		created.
-
-What:		/sys/fs/ext4/<disk>/session_write_kbytes
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of
-		kilobytes of data that have been written to this
-		filesystem since it was mounted.
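The tunables documented in the ABI file deleted above remain reachable under /proc/fs/ext4/ after this revert (see the Documentation/filesystems/proc.txt hunk further down, which re-adds them there without the mb_ prefix). A minimal userspace sketch of reading one of them; the device name "sda1" is an illustrative assumption, not part of the patch:

/* Read one ext4 tunable from the /proc interface this revert restores.
 * "sda1" is a placeholder; substitute the block device backing your
 * ext4 filesystem. */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/fs/ext4/sda1/stream_req", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("stream_req = %s", buf);
	fclose(f);
	return 0;
}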
diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt index 97882df04865..cec829bc7291 100644 --- a/trunk/Documentation/filesystems/ext4.txt +++ b/trunk/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* lift 32000 subdirectory limit imposed by i_links_count[1] +* fix 32000 subdirectory limit * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,9 +100,6 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) -[1] Filesystems with a block size of 1k may see a limit imposed by the -directory hash tree having a maximum depth of two. - 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) @@ -183,8 +180,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in -barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. -nobarrier This also requires an IO stack which can support + the jbd code. barrier=0 disables, barrier=1 enables. + This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -192,9 +189,6 @@ nobarrier This also requires an IO stack which can support safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. - The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for - consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -316,24 +310,6 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. -auto_da_alloc(*) Many broken applications don't use fsync() when -noauto_da_alloc replacing existing files via patterns such as - fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, - fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). - If auto_da_alloc is enabled, ext4 will detect - the replace-via-rename and replace-via-truncate - patterns and force that any delayed allocation - blocks are allocated such that at the next - journal commit, in the default data=ordered - mode, the data blocks of the new file are forced - to disk before the rename() operation is - commited. This provides roughly the same level - of guarantees as ext3, and avoids the - "zero-length" problem that can happen when a - system crashes before the delayed allocation - blocks are forced to disk. 
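The auto_da_alloc text removed just above documents the replace-via-rename pattern. A minimal sketch of the crash-safe way to perform that replacement once the heuristic is gone — an explicit fsync() before the rename(); the file names "foo"/"foo.new" come from the removed text, and the helper name is hypothetical:

/* Crash-safe replace-via-rename: without auto_da_alloc, applications
 * must fsync() the new file themselves before rename(), or risk a
 * zero-length "foo" if the system crashes before the delayed
 * allocation blocks reach disk. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int replace_file(const char *data, size_t len)
{
	int fd = open("foo.new", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return -1;
	if (write(fd, data, len) != (ssize_t)len ||
	    fsync(fd) != 0 ||		/* force delayed-alloc blocks to disk */
	    close(fd) != 0)
		return -1;
	return rename("foo.new", "foo");	/* atomically replace "foo" */
}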
- Data Mode ========= There are 3 different data modes: diff --git a/trunk/Documentation/filesystems/proc.txt b/trunk/Documentation/filesystems/proc.txt index efc4fd9f40ce..830bad7cce0f 100644 --- a/trunk/Documentation/filesystems/proc.txt +++ b/trunk/Documentation/filesystems/proc.txt @@ -940,6 +940,27 @@ Table 1-10: Files in /proc/fs/ext4/ File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history + stats controls whether the multiblock allocator should start + collecting statistics, which are shown during the unmount + group_prealloc the multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + max_to_scan The maximum number of extents the multiblock allocator + will search to find the best extent + min_to_scan The minimum number of extents the multiblock allocator + will search to find the best extent + order2_req Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + stream_req Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. +inode_readahead Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache .............................................................................. diff --git a/trunk/arch/ia64/include/asm/intrinsics.h b/trunk/arch/ia64/include/asm/intrinsics.h index 111ed5222892..c47830e26cb7 100644 --- a/trunk/arch/ia64/include/asm/intrinsics.h +++ b/trunk/arch/ia64/include/asm/intrinsics.h @@ -202,11 +202,7 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void); #ifndef __ASSEMBLY__ #if defined(CONFIG_PARAVIRT) && defined(__KERNEL__) -#ifdef ASM_SUPPORTED -# define IA64_INTRINSIC_API(name) paravirt_ ## name -#else -# define IA64_INTRINSIC_API(name) pv_cpu_ops.name -#endif +#define IA64_INTRINSIC_API(name) pv_cpu_ops.name #define IA64_INTRINSIC_MACRO(name) paravirt_ ## name #else #define IA64_INTRINSIC_API(name) ia64_native_ ## name diff --git a/trunk/arch/ia64/include/asm/mmu_context.h b/trunk/arch/ia64/include/asm/mmu_context.h index 7f2a456603cb..040bc87db930 100644 --- a/trunk/arch/ia64/include/asm/mmu_context.h +++ b/trunk/arch/ia64/include/asm/mmu_context.h @@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm) /* re-check, now that we've got the lock: */ context = mm->context; if (context == 0) { - cpumask_clear(mm_cpumask(mm)); + cpus_clear(mm->cpu_vm_mask); if (ia64_ctx.next >= ia64_ctx.limit) { ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.max_ctx, ia64_ctx.next); @@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm) do { context = get_mmu_context(mm); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) + cpu_set(smp_processor_id(), mm->cpu_vm_mask); reload_context(context); /* * in the unlikely event of a TLB-flush by another thread, diff --git a/trunk/arch/ia64/include/asm/module.h b/trunk/arch/ia64/include/asm/module.h index 908eaef42a08..d2da61e4c49b 100644 --- a/trunk/arch/ia64/include/asm/module.h +++ b/trunk/arch/ia64/include/asm/module.h @@ -16,12 +16,6 @@ struct 
mod_arch_specific { struct elf64_shdr *got; /* global offset table */ struct elf64_shdr *opd; /* official procedure descriptors */ struct elf64_shdr *unwind; /* unwind-table section */ -#ifdef CONFIG_PARAVIRT - struct elf64_shdr *paravirt_bundles; - /* paravirt_alt_bundle_patch table */ - struct elf64_shdr *paravirt_insts; - /* paravirt_alt_inst_patch table */ -#endif unsigned long gp; /* global-pointer for module */ void *core_unw_table; /* core unwind-table cookie returned by unwinder */ diff --git a/trunk/arch/ia64/include/asm/native/inst.h b/trunk/arch/ia64/include/asm/native/inst.h index d2d46efb3e6e..0a1026cca4fa 100644 --- a/trunk/arch/ia64/include/asm/native/inst.h +++ b/trunk/arch/ia64/include/asm/native/inst.h @@ -30,9 +30,6 @@ #define __paravirt_work_processed_syscall_target \ ia64_work_processed_syscall -#define paravirt_fsyscall_table ia64_native_fsyscall_table -#define paravirt_fsys_bubble_down ia64_native_fsys_bubble_down - #ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK # define PARAVIRT_POISON 0xdeadbeefbaadf00d # define CLOBBER(clob) \ @@ -77,11 +74,6 @@ (pred) mov reg = psr \ CLOBBER(clob) -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ -(pred) mov reg = ar.itc \ - CLOBBER(clob) \ - CLOBBER_PRED(pred_clob) - #define MOV_TO_IFA(reg, clob) \ mov cr.ifa = reg \ CLOBBER(clob) @@ -166,11 +158,6 @@ #define RSM_PSR_DT \ rsm psr.dt -#define RSM_PSR_BE_I(clob0, clob1) \ - rsm psr.be | psr.i \ - CLOBBER(clob0) \ - CLOBBER(clob1) - #define SSM_PSR_DT_AND_SRLZ_I \ ssm psr.dt \ ;; \ diff --git a/trunk/arch/ia64/include/asm/native/patchlist.h b/trunk/arch/ia64/include/asm/native/patchlist.h deleted file mode 100644 index be16ca9311bf..000000000000 --- a/trunk/arch/ia64/include/asm/native/patchlist.h +++ /dev/null @@ -1,38 +0,0 @@ -/****************************************************************************** - * arch/ia64/include/asm/native/inst.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#define __paravirt_start_gate_fsyscall_patchlist \ - __ia64_native_start_gate_fsyscall_patchlist -#define __paravirt_end_gate_fsyscall_patchlist \ - __ia64_native_end_gate_fsyscall_patchlist -#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ - __ia64_native_start_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ - __ia64_native_end_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_start_gate_vtop_patchlist \ - __ia64_native_start_gate_vtop_patchlist -#define __paravirt_end_gate_vtop_patchlist \ - __ia64_native_end_gate_vtop_patchlist -#define __paravirt_start_gate_mckinley_e9_patchlist \ - __ia64_native_start_gate_mckinley_e9_patchlist -#define __paravirt_end_gate_mckinley_e9_patchlist \ - __ia64_native_end_gate_mckinley_e9_patchlist diff --git a/trunk/arch/ia64/include/asm/native/pvchk_inst.h b/trunk/arch/ia64/include/asm/native/pvchk_inst.h index 8d72962ec838..b8e6eb1090d7 100644 --- a/trunk/arch/ia64/include/asm/native/pvchk_inst.h +++ b/trunk/arch/ia64/include/asm/native/pvchk_inst.h @@ -180,11 +180,6 @@ IS_PRED_IN(pred) \ IS_RREG_OUT(reg) \ IS_RREG_CLOB(clob) -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ - IS_PRED_IN(pred) \ - IS_PRED_CLOB(pred_clob) \ - IS_RREG_OUT(reg) \ - IS_RREG_CLOB(clob) #define MOV_TO_IFA(reg, clob) \ IS_RREG_IN(reg) \ IS_RREG_CLOB(clob) @@ -251,9 +246,6 @@ IS_RREG_CLOB(clob2) #define RSM_PSR_DT \ nop 0 -#define RSM_PSR_BE_I(clob0, clob1) \ - IS_RREG_CLOB(clob0) \ - IS_RREG_CLOB(clob1) #define SSM_PSR_DT_AND_SRLZ_I \ nop 0 #define BSW_0(clob0, clob1, clob2) \ diff --git a/trunk/arch/ia64/include/asm/paravirt.h b/trunk/arch/ia64/include/asm/paravirt.h index 2eb0a981a09a..2bf3636473fe 100644 --- a/trunk/arch/ia64/include/asm/paravirt.h +++ b/trunk/arch/ia64/include/asm/paravirt.h @@ -22,56 +22,6 @@ #ifndef __ASM_PARAVIRT_H #define __ASM_PARAVIRT_H -#ifndef __ASSEMBLY__ -/****************************************************************************** - * fsys related addresses - */ -struct pv_fsys_data { - unsigned long *fsyscall_table; - void *fsys_bubble_down; -}; - -extern struct pv_fsys_data pv_fsys_data; - -unsigned long *paravirt_get_fsyscall_table(void); -char *paravirt_get_fsys_bubble_down(void); - -/****************************************************************************** - * patchlist addresses for gate page - */ -enum pv_gate_patchlist { - PV_GATE_START_FSYSCALL, - PV_GATE_END_FSYSCALL, - - PV_GATE_START_BRL_FSYS_BUBBLE_DOWN, - PV_GATE_END_BRL_FSYS_BUBBLE_DOWN, - - PV_GATE_START_VTOP, - PV_GATE_END_VTOP, - - PV_GATE_START_MCKINLEY_E9, - PV_GATE_END_MCKINLEY_E9, -}; - -struct pv_patchdata { - unsigned long start_fsyscall_patchlist; - unsigned long end_fsyscall_patchlist; - unsigned long start_brl_fsys_bubble_down_patchlist; - unsigned long end_brl_fsys_bubble_down_patchlist; - unsigned long start_vtop_patchlist; - unsigned long end_vtop_patchlist; - unsigned long start_mckinley_e9_patchlist; - unsigned long end_mckinley_e9_patchlist; - - void *gate_section; -}; - -extern struct pv_patchdata pv_patchdata; - -unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type); -void *paravirt_get_gate_section(void); -#endif - #ifdef CONFIG_PARAVIRT_GUEST #define PARAVIRT_HYPERVISOR_TYPE_DEFAULT 0 @@ -118,14 +68,6 @@ struct pv_init_ops { int 
(*arch_setup_nomca)(void); void (*post_smp_prepare_boot_cpu)(void); - -#ifdef ASM_SUPPORTED - unsigned long (*patch_bundle)(void *sbundle, void *ebundle, - unsigned long type); - unsigned long (*patch_inst)(unsigned long stag, unsigned long etag, - unsigned long type); -#endif - void (*patch_branch)(unsigned long tag, unsigned long type); }; extern struct pv_init_ops pv_init_ops; @@ -268,8 +210,6 @@ struct pv_time_ops { int (*do_steal_accounting)(unsigned long *new_itm); void (*clocksource_resume)(void); - - unsigned long long (*sched_clock)(void); }; extern struct pv_time_ops pv_time_ops; @@ -287,11 +227,6 @@ paravirt_do_steal_accounting(unsigned long *new_itm) return pv_time_ops.do_steal_accounting(new_itm); } -static inline unsigned long long paravirt_sched_clock(void) -{ - return pv_time_ops.sched_clock(); -} - #endif /* !__ASSEMBLY__ */ #else diff --git a/trunk/arch/ia64/include/asm/paravirt_patch.h b/trunk/arch/ia64/include/asm/paravirt_patch.h deleted file mode 100644 index 128ff5db6e67..000000000000 --- a/trunk/arch/ia64/include/asm/paravirt_patch.h +++ /dev/null @@ -1,143 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#ifndef __ASM_PARAVIRT_PATCH_H -#define __ASM_PARAVIRT_PATCH_H - -#ifdef __ASSEMBLY__ - - .section .paravirt_branches, "a" - .previous -#define PARAVIRT_PATCH_SITE_BR(type) \ - { \ - [1:] ; \ - br.cond.sptk.many 2f ; \ - nop.b 0 ; \ - nop.b 0;; ; \ - } ; \ - 2: \ - .xdata8 ".paravirt_branches", 1b, type - -#else - -#include -#include - -/* for binary patch */ -struct paravirt_patch_site_bundle { - void *sbundle; - void *ebundle; - unsigned long type; -}; - -/* label means the beginning of new bundle */ -#define paravirt_alt_bundle(instr, privop) \ - "\t998:\n" \ - "\t" instr "\n" \ - "\t999:\n" \ - "\t.pushsection .paravirt_bundles, \"a\"\n" \ - "\t.popsection\n" \ - "\t.xdata8 \".paravirt_bundles\", 998b, 999b, " \ - __stringify(privop) "\n" - - -struct paravirt_patch_bundle_elem { - const void *sbundle; - const void *ebundle; - unsigned long type; -}; - - -struct paravirt_patch_site_inst { - unsigned long stag; - unsigned long etag; - unsigned long type; -}; - -#define paravirt_alt_inst(instr, privop) \ - "\t[998:]\n" \ - "\t" instr "\n" \ - "\t[999:]\n" \ - "\t.pushsection .paravirt_insts, \"a\"\n" \ - "\t.popsection\n" \ - "\t.xdata8 \".paravirt_insts\", 998b, 999b, " \ - __stringify(privop) "\n" - -struct paravirt_patch_site_branch { - unsigned long tag; - unsigned long type; -}; - -struct paravirt_patch_branch_target { - const void *entry; - unsigned long type; -}; - -void -__paravirt_patch_apply_branch( - unsigned long tag, unsigned long type, - const struct paravirt_patch_branch_target *entries, - unsigned int nr_entries); - 
-void -paravirt_patch_reloc_br(unsigned long tag, const void *target); - -void -paravirt_patch_reloc_brl(unsigned long tag, const void *target); - - -#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT) -unsigned long -ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type); - -unsigned long -__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, - const struct paravirt_patch_bundle_elem *elems, - unsigned long nelems, - const struct paravirt_patch_bundle_elem **found); - -void -paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, - const struct paravirt_patch_site_bundle *end); - -void -paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, - const struct paravirt_patch_site_inst *end); - -void paravirt_patch_apply(void); -#else -#define paravirt_patch_apply_bundle(start, end) do { } while (0) -#define paravirt_patch_apply_inst(start, end) do { } while (0) -#define paravirt_patch_apply() do { } while (0) -#endif - -#endif /* !__ASSEMBLEY__ */ - -#endif /* __ASM_PARAVIRT_PATCH_H */ - -/* - * Local variables: - * mode: C - * c-set-style: "linux" - * c-basic-offset: 8 - * tab-width: 8 - * indent-tabs-mode: t - * End: - */ diff --git a/trunk/arch/ia64/include/asm/paravirt_privop.h b/trunk/arch/ia64/include/asm/paravirt_privop.h index 3d2951130b5f..33c8e55f5775 100644 --- a/trunk/arch/ia64/include/asm/paravirt_privop.h +++ b/trunk/arch/ia64/include/asm/paravirt_privop.h @@ -33,7 +33,7 @@ */ struct pv_cpu_ops { - void (*fc)(void *addr); + void (*fc)(unsigned long addr); unsigned long (*thash)(unsigned long addr); unsigned long (*get_cpuid)(int index); unsigned long (*get_pmd)(int index); @@ -60,18 +60,12 @@ extern unsigned long ia64_native_getreg_func(int regnum); /* Instructions paravirtualized for performance */ /************************************************/ -#ifndef ASM_SUPPORTED -#define paravirt_ssm_i() pv_cpu_ops.ssm_i() -#define paravirt_rsm_i() pv_cpu_ops.rsm_i() -#define __paravirt_getreg() pv_cpu_ops.getreg() -#endif - /* mask for ia64_native_ssm/rsm() must be constant.("i" constraing). * static inline function doesn't satisfy it. 
*/ #define paravirt_ssm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - paravirt_ssm_i(); \ + pv_cpu_ops.ssm_i(); \ else \ ia64_native_ssm(mask); \ } while (0) @@ -79,7 +73,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); #define paravirt_rsm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - paravirt_rsm_i(); \ + pv_cpu_ops.rsm_i(); \ else \ ia64_native_rsm(mask); \ } while (0) @@ -92,7 +86,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); if ((reg) == _IA64_REG_IP) \ res = ia64_native_getreg(_IA64_REG_IP); \ else \ - res = __paravirt_getreg(reg); \ + res = pv_cpu_ops.getreg(reg); \ res; \ }) @@ -118,12 +112,6 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); #endif /* CONFIG_PARAVIRT */ -#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED) -#define paravirt_dv_serialize_data() ia64_dv_serialize_data() -#else -#define paravirt_dv_serialize_data() /* nothing */ -#endif - /* these routines utilize privilege-sensitive or performance-sensitive * privileged instructions so the code must be replaced with * paravirtualized versions */ @@ -133,349 +121,4 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); IA64_PARAVIRT_ASM_FUNC(work_processed_syscall) #define ia64_leave_kernel IA64_PARAVIRT_ASM_FUNC(leave_kernel) - -#if defined(CONFIG_PARAVIRT) -/****************************************************************************** - * binary patching infrastructure - */ -#define PARAVIRT_PATCH_TYPE_FC 1 -#define PARAVIRT_PATCH_TYPE_THASH 2 -#define PARAVIRT_PATCH_TYPE_GET_CPUID 3 -#define PARAVIRT_PATCH_TYPE_GET_PMD 4 -#define PARAVIRT_PATCH_TYPE_PTCGA 5 -#define PARAVIRT_PATCH_TYPE_GET_RR 6 -#define PARAVIRT_PATCH_TYPE_SET_RR 7 -#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4 8 -#define PARAVIRT_PATCH_TYPE_SSM_I 9 -#define PARAVIRT_PATCH_TYPE_RSM_I 10 -#define PARAVIRT_PATCH_TYPE_GET_PSR_I 11 -#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE 12 - -/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */ -#define PARAVIRT_PATCH_TYPE_GETREG 0x10000000 -#define PARAVIRT_PATCH_TYPE_SETREG 0x20000000 - -/* - * struct task_struct* (*ia64_switch_to)(void* next_task); - * void *ia64_leave_syscall; - * void *ia64_work_processed_syscall - * void *ia64_leave_kernel; - */ - -#define PARAVIRT_PATCH_TYPE_BR_START 0x30000000 -#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO \ - (PARAVIRT_PATCH_TYPE_BR_START + 0) -#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL \ - (PARAVIRT_PATCH_TYPE_BR_START + 1) -#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL \ - (PARAVIRT_PATCH_TYPE_BR_START + 2) -#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL \ - (PARAVIRT_PATCH_TYPE_BR_START + 3) - -#ifdef ASM_SUPPORTED -#include - -/* - * pv_cpu_ops calling stub. - * normal function call convension can't be written by gcc - * inline assembly. - * - * from the caller's point of view, - * the following registers will be clobbered. - * r2, r3 - * r8-r15 - * r16, r17 - * b6, b7 - * p6-p15 - * ar.ccv - * - * from the callee's point of view , - * the following registers can be used. - * r2, r3: scratch - * r8: scratch, input argument0 and return value - * r0-r15: scratch, input argument1-5 - * b6: return pointer - * b7: scratch - * p6-p15: scratch - * ar.ccv: scratch - * - * other registers must not be changed. especially - * b0: rp: preserved. gcc ignores b0 in clobbered register. 
- * r16: saved gp - */ -/* 5 bundles */ -#define __PARAVIRT_BR \ - ";;\n" \ - "{ .mlx\n" \ - "nop 0\n" \ - "movl r2 = %[op_addr]\n"/* get function pointer address */ \ - ";;\n" \ - "}\n" \ - "1:\n" \ - "{ .mii\n" \ - "ld8 r2 = [r2]\n" /* load function descriptor address */ \ - "mov r17 = ip\n" /* get ip to calc return address */ \ - "mov r16 = gp\n" /* save gp */ \ - ";;\n" \ - "}\n" \ - "{ .mii\n" \ - "ld8 r3 = [r2], 8\n" /* load entry address */ \ - "adds r17 = 1f - 1b, r17\n" /* calculate return address */ \ - ";;\n" \ - "mov b7 = r3\n" /* set entry address */ \ - "}\n" \ - "{ .mib\n" \ - "ld8 gp = [r2]\n" /* load gp value */ \ - "mov b6 = r17\n" /* set return address */ \ - "br.cond.sptk.few b7\n" /* intrinsics are very short isns */ \ - "}\n" \ - "1:\n" \ - "{ .mii\n" \ - "mov gp = r16\n" /* restore gp value */ \ - "nop 0\n" \ - "nop 0\n" \ - ";;\n" \ - "}\n" - -#define PARAVIRT_OP(op) \ - [op_addr] "i"(&pv_cpu_ops.op) - -#define PARAVIRT_TYPE(type) \ - PARAVIRT_PATCH_TYPE_ ## type - -#define PARAVIRT_REG_CLOBBERS0 \ - "r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS1 \ - "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS2 \ - "r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS5 \ - "r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/ \ - "r15", "r16", "r17" - -#define PARAVIRT_BR_CLOBBERS \ - "b6", "b7" - -#define PARAVIRT_PR_CLOBBERS \ - "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15" - -#define PARAVIRT_AR_CLOBBERS \ - "ar.ccv" - -#define PARAVIRT_CLOBBERS0 \ - PARAVIRT_REG_CLOBBERS0, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS1 \ - PARAVIRT_REG_CLOBBERS1, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS2 \ - PARAVIRT_REG_CLOBBERS2, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS5 \ - PARAVIRT_REG_CLOBBERS5, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_BR0(op, type) \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op) \ - : PARAVIRT_CLOBBERS0) - -#define PARAVIRT_BR0_RET(op, type) \ - register unsigned long ia64_intri_res asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(op) \ - : PARAVIRT_CLOBBERS0) - -#define PARAVIRT_BR1(op, type, arg1) \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR1_RET(op, type, arg1) \ - register unsigned long ia64_intri_res asm ("r8"); \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR1_VOID(op, type, arg1) \ - register void *__##arg1 asm ("r8") = arg1; \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - 
PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR2(op, type, arg1, arg2) \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - register unsigned long __##arg2 asm ("r9") = arg2; \ - register unsigned long ia64_clobber1 asm ("r8"); \ - register unsigned long ia64_clobber2 asm ("r9"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber1), "=r"(ia64_clobber2) \ - : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2) \ - : PARAVIRT_CLOBBERS2) - - -#define PARAVIRT_DEFINE_CPU_OP0(op, type) \ - static inline void \ - paravirt_ ## op (void) \ - { \ - PARAVIRT_BR0(op, type); \ - } - -#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type) \ - static inline unsigned long \ - paravirt_ ## op (void) \ - { \ - PARAVIRT_BR0_RET(op, type); \ - return ia64_intri_res; \ - } - -#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type) \ - static inline void \ - paravirt_ ## op (void *arg1) \ - { \ - PARAVIRT_BR1_VOID(op, type, arg1); \ - } - -#define PARAVIRT_DEFINE_CPU_OP1(op, type) \ - static inline void \ - paravirt_ ## op (unsigned long arg1) \ - { \ - PARAVIRT_BR1(op, type, arg1); \ - } - -#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type) \ - static inline unsigned long \ - paravirt_ ## op (unsigned long arg1) \ - { \ - PARAVIRT_BR1_RET(op, type, arg1); \ - return ia64_intri_res; \ - } - -#define PARAVIRT_DEFINE_CPU_OP2(op, type) \ - static inline void \ - paravirt_ ## op (unsigned long arg1, \ - unsigned long arg2) \ - { \ - PARAVIRT_BR2(op, type, arg1, arg2); \ - } - - -PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC); -PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH) -PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID) -PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD) -PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA) -PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR) -PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR) -PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I) -PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I) -PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I) -PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE) - -static inline void -paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4) -{ - register unsigned long __val0 asm ("r8") = val0; - register unsigned long __val1 asm ("r9") = val1; - register unsigned long __val2 asm ("r10") = val2; - register unsigned long __val3 asm ("r11") = val3; - register unsigned long __val4 asm ("r14") = val4; - - register unsigned long ia64_clobber0 asm ("r8"); - register unsigned long ia64_clobber1 asm ("r9"); - register unsigned long ia64_clobber2 asm ("r10"); - register unsigned long ia64_clobber3 asm ("r11"); - register unsigned long ia64_clobber4 asm ("r14"); - - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, - PARAVIRT_TYPE(SET_RR0_TO_RR4)) - : "=r"(ia64_clobber0), - "=r"(ia64_clobber1), - "=r"(ia64_clobber2), - "=r"(ia64_clobber3), - "=r"(ia64_clobber4) - : PARAVIRT_OP(set_rr0_to_rr4), - "0"(__val0), "1"(__val1), "2"(__val2), - "3"(__val3), "4"(__val4) - : PARAVIRT_CLOBBERS5); -} - -/* unsigned long paravirt_getreg(int reg) */ -#define __paravirt_getreg(reg) \ - ({ \ - register unsigned long ia64_intri_res asm ("r8"); \ - register unsigned long __reg asm ("r8") = (reg); \ - \ - BUILD_BUG_ON(!__builtin_constant_p(reg)); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(GETREG) \ - + (reg)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(getreg), "0"(__reg) \ - : PARAVIRT_CLOBBERS1); \ - \ - 
ia64_intri_res; \ - }) - -/* void paravirt_setreg(int reg, unsigned long val) */ -#define paravirt_setreg(reg, val) \ - do { \ - register unsigned long __val asm ("r8") = val; \ - register unsigned long __reg asm ("r9") = reg; \ - register unsigned long ia64_clobber1 asm ("r8"); \ - register unsigned long ia64_clobber2 asm ("r9"); \ - \ - BUILD_BUG_ON(!__builtin_constant_p(reg)); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(SETREG) \ - + (reg)) \ - : "=r"(ia64_clobber1), \ - "=r"(ia64_clobber2) \ - : PARAVIRT_OP(setreg), \ - "1"(__reg), "0"(__val) \ - : PARAVIRT_CLOBBERS2); \ - } while (0) - -#endif /* ASM_SUPPORTED */ -#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */ - #endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */ diff --git a/trunk/arch/ia64/include/asm/smp.h b/trunk/arch/ia64/include/asm/smp.h index 598408336251..21c402365d0e 100644 --- a/trunk/arch/ia64/include/asm/smp.h +++ b/trunk/arch/ia64/include/asm/smp.h @@ -126,8 +126,7 @@ extern void identify_siblings (struct cpuinfo_ia64 *); extern int is_multithreading_enabled(void); extern void arch_send_call_function_single_ipi(int cpu); -extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask +extern void arch_send_call_function_ipi(cpumask_t mask); #else /* CONFIG_SMP */ diff --git a/trunk/arch/ia64/include/asm/timex.h b/trunk/arch/ia64/include/asm/timex.h index 86c7db861180..4e03cfe74a0c 100644 --- a/trunk/arch/ia64/include/asm/timex.h +++ b/trunk/arch/ia64/include/asm/timex.h @@ -40,6 +40,5 @@ get_cycles (void) } extern void ia64_cpu_local_tick (void); -extern unsigned long long ia64_native_sched_clock (void); #endif /* _ASM_IA64_TIMEX_H */ diff --git a/trunk/arch/ia64/include/asm/topology.h b/trunk/arch/ia64/include/asm/topology.h index 7b4c8c70b2d1..f260dcf21515 100644 --- a/trunk/arch/ia64/include/asm/topology.h +++ b/trunk/arch/ia64/include/asm/topology.h @@ -112,6 +112,11 @@ void build_cpu_to_node_map(void); extern void arch_fix_phys_package_id(int num, u32 slot); +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) + #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? 
\ cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) diff --git a/trunk/arch/ia64/include/asm/xen/hypervisor.h b/trunk/arch/ia64/include/asm/xen/hypervisor.h index e425227a418e..7a804e80fc67 100644 --- a/trunk/arch/ia64/include/asm/xen/hypervisor.h +++ b/trunk/arch/ia64/include/asm/xen/hypervisor.h @@ -33,6 +33,9 @@ #ifndef _ASM_IA64_XEN_HYPERVISOR_H #define _ASM_IA64_XEN_HYPERVISOR_H +#ifdef CONFIG_XEN + +#include #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ @@ -40,32 +43,22 @@ /* xen_domain_type is set before executing any C code by early_xen_setup */ enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/ + XEN_NATIVE, + XEN_PV_DOMAIN, + XEN_HVM_DOMAIN, }; -#ifdef CONFIG_XEN extern enum xen_domain_type xen_domain_type; -#else -#define xen_domain_type XEN_NATIVE -#endif #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) - -#ifdef CONFIG_XEN_DOM0 -#define xen_initial_domain() (xen_pv_domain() && \ +#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#define xen_initial_domain() (xen_pv_domain() && \ (xen_start_info->flags & SIF_INITDOMAIN)) -#else -#define xen_initial_domain() (0) -#endif +#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) +/* deprecated. remove this */ +#define is_running_on_xen() (xen_domain_type == XEN_PV_DOMAIN) -#ifdef CONFIG_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; @@ -81,6 +74,16 @@ void force_evtchn_callback(void); /* For setup_arch() in arch/ia64/kernel/setup.c */ void xen_ia64_enable_opt_feature(void); + +#else /* CONFIG_XEN */ + +#define xen_domain() (0) +#define xen_pv_domain() (0) +#define xen_initial_domain() (0) +#define xen_hvm_domain() (0) +#define is_running_on_xen() (0) /* deprecated. remove this */ #endif +#define is_initial_xendomain() (0) /* deprecated. remove this */ + #endif /* _ASM_IA64_XEN_HYPERVISOR_H */ diff --git a/trunk/arch/ia64/include/asm/xen/inst.h b/trunk/arch/ia64/include/asm/xen/inst.h index c53a47611208..19c2ae1d878a 100644 --- a/trunk/arch/ia64/include/asm/xen/inst.h +++ b/trunk/arch/ia64/include/asm/xen/inst.h @@ -33,9 +33,6 @@ #define __paravirt_work_processed_syscall_target \ xen_work_processed_syscall -#define paravirt_fsyscall_table xen_fsyscall_table -#define paravirt_fsys_bubble_down xen_fsys_bubble_down - #define MOV_FROM_IFA(reg) \ movl reg = XSI_IFA; \ ;; \ @@ -113,27 +110,6 @@ .endm #define MOV_FROM_PSR(pred, reg, clob) __MOV_FROM_PSR pred, reg, clob -/* assuming ar.itc is read with interrupt disabled. 
*/ -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ -(pred) movl clob = XSI_ITC_OFFSET; \ - ;; \ -(pred) ld8 clob = [clob]; \ -(pred) mov reg = ar.itc; \ - ;; \ -(pred) add reg = reg, clob; \ - ;; \ -(pred) movl clob = XSI_ITC_LAST; \ - ;; \ -(pred) ld8 clob = [clob]; \ - ;; \ -(pred) cmp.geu.unc pred_clob, p0 = clob, reg; \ - ;; \ -(pred_clob) add reg = 1, clob; \ - ;; \ -(pred) movl clob = XSI_ITC_LAST; \ - ;; \ -(pred) st8 [clob] = reg - #define MOV_TO_IFA(reg, clob) \ movl clob = XSI_IFA; \ @@ -386,10 +362,6 @@ #define RSM_PSR_DT \ XEN_HYPER_RSM_PSR_DT -#define RSM_PSR_BE_I(clob0, clob1) \ - RSM_PSR_I(p0, clob0, clob1); \ - rum psr.be - #define SSM_PSR_DT_AND_SRLZ_I \ XEN_HYPER_SSM_PSR_DT diff --git a/trunk/arch/ia64/include/asm/xen/interface.h b/trunk/arch/ia64/include/asm/xen/interface.h index e951e740bdf2..f00fab40854d 100644 --- a/trunk/arch/ia64/include/asm/xen/interface.h +++ b/trunk/arch/ia64/include/asm/xen/interface.h @@ -209,15 +209,6 @@ struct mapped_regs { unsigned long krs[8]; /* kernel registers */ unsigned long tmp[16]; /* temp registers (e.g. for hyperprivops) */ - - /* itc paravirtualization - * vAR.ITC = mAR.ITC + itc_offset - * itc_last is one which was lastly passed to - * the guest OS in order to prevent it from - * going backwords. - */ - unsigned long itc_offset; - unsigned long itc_last; }; }; }; diff --git a/trunk/arch/ia64/include/asm/xen/minstate.h b/trunk/arch/ia64/include/asm/xen/minstate.h index c57fa910f2c9..4d92d9bbda7b 100644 --- a/trunk/arch/ia64/include/asm/xen/minstate.h +++ b/trunk/arch/ia64/include/asm/xen/minstate.h @@ -1,12 +1,3 @@ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -/* read ar.itc in advance, and use it before leaving bank 0 */ -#define XEN_ACCOUNT_GET_STAMP \ - MOV_FROM_ITC(pUStk, p6, r20, r2); -#else -#define XEN_ACCOUNT_GET_STAMP -#endif - /* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back @@ -132,7 +123,7 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r2,16; \ .mem.offset 8,0; st8.spill [r17]=r3,16; \ - XEN_ACCOUNT_GET_STAMP \ + ACCOUNT_GET_STAMP \ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ ;; \ EXTRA; \ diff --git a/trunk/arch/ia64/include/asm/xen/patchlist.h b/trunk/arch/ia64/include/asm/xen/patchlist.h deleted file mode 100644 index eae944e88846..000000000000 --- a/trunk/arch/ia64/include/asm/xen/patchlist.h +++ /dev/null @@ -1,38 +0,0 @@ -/****************************************************************************** - * arch/ia64/include/asm/xen/patchlist.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#define __paravirt_start_gate_fsyscall_patchlist \ - __xen_start_gate_fsyscall_patchlist -#define __paravirt_end_gate_fsyscall_patchlist \ - __xen_end_gate_fsyscall_patchlist -#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ - __xen_start_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ - __xen_end_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_start_gate_vtop_patchlist \ - __xen_start_gate_vtop_patchlist -#define __paravirt_end_gate_vtop_patchlist \ - __xen_end_gate_vtop_patchlist -#define __paravirt_start_gate_mckinley_e9_patchlist \ - __xen_start_gate_mckinley_e9_patchlist -#define __paravirt_end_gate_mckinley_e9_patchlist \ - __xen_end_gate_mckinley_e9_patchlist diff --git a/trunk/arch/ia64/include/asm/xen/privop.h b/trunk/arch/ia64/include/asm/xen/privop.h index fb4ec5e0b066..71ec7546e100 100644 --- a/trunk/arch/ia64/include/asm/xen/privop.h +++ b/trunk/arch/ia64/include/asm/xen/privop.h @@ -55,8 +55,6 @@ #define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS) #define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS) #define XSI_IHA (XSI_BASE + XSI_IHA_OFS) -#define XSI_ITC_OFFSET (XSI_BASE + XSI_ITC_OFFSET_OFS) -#define XSI_ITC_LAST (XSI_BASE + XSI_ITC_LAST_OFS) #endif #ifndef __ASSEMBLY__ @@ -69,7 +67,7 @@ * may have different semantics depending on whether they are executed * at PL0 vs PL!=0. When paravirtualized, these instructions mustn't * be allowed to execute directly, lest incorrect semantics result. */ -extern void xen_fc(void *addr); +extern void xen_fc(unsigned long addr); extern unsigned long xen_thash(unsigned long addr); /* Note that "ttag" and "cover" are also privilege-sensitive; "ttag" @@ -82,10 +80,8 @@ extern unsigned long xen_thash(unsigned long addr); extern unsigned long xen_get_cpuid(int index); extern unsigned long xen_get_pmd(int index); -#ifndef ASM_SUPPORTED extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ -#endif /************************************************/ /* Instructions paravirtualized for performance */ @@ -110,7 +106,6 @@ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ #define xen_get_virtual_pend() \ (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1)) -#ifndef ASM_SUPPORTED /* Although all privileged operations can be left to trap and will * be properly handled by Xen, some are frequent enough that we use * hyperprivops for performance. 
*/ @@ -128,7 +123,6 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, unsigned long val4); extern void xen_set_kr(unsigned long index, unsigned long val); extern void xen_ptcga(unsigned long addr, unsigned long size); -#endif /* !ASM_SUPPORTED */ #endif /* !__ASSEMBLY__ */ diff --git a/trunk/arch/ia64/kernel/Makefile b/trunk/arch/ia64/kernel/Makefile index 5628e9a990a6..f2778f2c4fd9 100644 --- a/trunk/arch/ia64/kernel/Makefile +++ b/trunk/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o dma-mapping.o @@ -36,8 +36,7 @@ obj-$(CONFIG_PCI_MSI) += msi_ia64.o mca_recovery-y += mca_drv.o mca_drv_asm.o obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \ - paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o obj-$(CONFIG_IA64_ESI) += esi.o ifneq ($(CONFIG_IA64_ESI),) @@ -46,13 +45,35 @@ endif obj-$(CONFIG_DMAR) += pci-dma.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +# The gate DSO image is built using a special linker script. +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 -# The gate DSO image is built using a special linker script. -include $(srctree)/arch/ia64/kernel/Makefile.gate -# tell compiled for native -CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data.gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config define sed-y @@ -88,9 +109,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s clean-files += $(objtree)/include/asm-ia64/nr-irqs.h # -# native ivt.S, entry.S and fsys.S +# native ivt.S and entry.S # -ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o +ASM_PARAVIRT_OBJS = ivt.o entry.o define paravirtualized_native AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK diff --git a/trunk/arch/ia64/kernel/Makefile.gate b/trunk/arch/ia64/kernel/Makefile.gate deleted file mode 100644 index 1d87f84069b3..000000000000 --- a/trunk/arch/ia64/kernel/Makefile.gate +++ /dev/null @@ -1,27 +0,0 @@ -# The gate DSO image is built using a special linker script. 
- -targets += gate.so gate-syms.o - -extra-y += gate.so gate-syms.o gate.lds gate.o - -CPPFLAGS_gate.lds := -P -C -U$(ARCH) - -quiet_cmd_gate = GATE $@ - cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ - -GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -$(obj)/built-in.o: $(obj)/gate-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o - -GATECFLAGS_gate-syms.o = -r -$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -# gate-data.o contains the gate DSO image as data in section .data.gate. -# We must build gate.so before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/gate-data.o: $(obj)/gate.so diff --git a/trunk/arch/ia64/kernel/acpi.c b/trunk/arch/ia64/kernel/acpi.c index 5510317db37b..bdef2ce38c8b 100644 --- a/trunk/arch/ia64/kernel/acpi.c +++ b/trunk/arch/ia64/kernel/acpi.c @@ -890,7 +890,7 @@ __init void prefill_possible_map(void) possible, max((possible - available_cpus), 0)); for (i = 0; i < possible; i++) - set_cpu_possible(i, true); + cpu_set(i, cpu_possible_map); } int acpi_map_lsapic(acpi_handle handle, int *pcpu) @@ -928,9 +928,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu) buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - cpumask_complement(&tmp_map, cpu_present_mask); - cpu = cpumask_first(&tmp_map); - if (cpu >= nr_cpu_ids) + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if (cpu >= NR_CPUS) return -EINVAL; acpi_map_cpu2node(handle, cpu, physid); diff --git a/trunk/arch/ia64/kernel/asm-offsets.c b/trunk/arch/ia64/kernel/asm-offsets.c index af5650169043..742dbb1d5a4f 100644 --- a/trunk/arch/ia64/kernel/asm-offsets.c +++ b/trunk/arch/ia64/kernel/asm-offsets.c @@ -316,7 +316,5 @@ void foo(void) DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]); DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat); DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat); - DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset); - DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last); #endif /* CONFIG_XEN */ } diff --git a/trunk/arch/ia64/kernel/efi.c b/trunk/arch/ia64/kernel/efi.c index 7ef80e8161ce..efaff15d8cf1 100644 --- a/trunk/arch/ia64/kernel/efi.c +++ b/trunk/arch/ia64/kernel/efi.c @@ -456,7 +456,6 @@ efi_map_pal_code (void) GRANULEROUNDDOWN((unsigned long) pal_vaddr), pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), IA64_GRANULE_SHIFT); - paravirt_dv_serialize_data(); ia64_set_psr(psr); /* restore psr */ } diff --git a/trunk/arch/ia64/kernel/entry.S b/trunk/arch/ia64/kernel/entry.S index ccfdeee9d89f..e5341e2c1175 100644 --- a/trunk/arch/ia64/kernel/entry.S +++ b/trunk/arch/ia64/kernel/entry.S @@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall) __paravirt_work_processed_syscall: #ifdef CONFIG_VIRT_CPU_ACCOUNTING adds r2=PT(LOADRS)+16,r12 - MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave +(pUStk) mov.m r22=ar.itc // fetch time at leave adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 ;; (p6) ld4 r31=[r18] // load current_thread_info()->flags @@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel) #ifdef CONFIG_VIRT_CPU_ACCOUNTING .pred.rel.mutex pUStk,pKStk MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled - MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave +(pUStk) mov.m r22=ar.itc // M fetch time at leave nop.i 0 ;; #else diff --git 
a/trunk/arch/ia64/kernel/fsys.S b/trunk/arch/ia64/kernel/fsys.S index 3567d54f8cee..c1625c7e1779 100644 --- a/trunk/arch/ia64/kernel/fsys.S +++ b/trunk/arch/ia64/kernel/fsys.S @@ -25,7 +25,6 @@ #include #include "entry.h" -#include "paravirt_inst.h" /* * See Documentation/ia64/fsys.txt for details on fsyscalls. @@ -280,7 +279,7 @@ ENTRY(fsys_gettimeofday) (p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .pred.rel.mutex p8,p9 - MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! +(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. (p13) ld8 r25 = [r19] // get itc_lastcycle value ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec @@ -419,7 +418,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) ;; - RSM_PSR_I(p0, r18, r19) // mask interrupt delivery + rsm psr.i // mask interrupt delivery mov ar.ccv=0 andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP @@ -492,7 +491,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - SSM_PSR_I(p0, p9, r31) + ssm psr.i ;; srlz.d // ensure psr.i is set again @@ -514,7 +513,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - SSM_PSR_I(p0, p9, r17) + ssm psr.i ;; srlz.d br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall @@ -522,7 +521,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP .lock_contention: /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ - SSM_PSR_I(p0, p9, r17) + ssm psr.i ;; srlz.d br.sptk.many fsys_fallback_syscall @@ -593,17 +592,17 @@ ENTRY(fsys_fallback_syscall) adds r17=-1024,r15 movl r14=sys_call_table ;; - RSM_PSR_I(p0, r26, r27) + rsm psr.i shladd r18=r17,3,r14 ;; ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point - MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency) + mov r29=psr // read psr (12 cyc load latency) mov r27=ar.rsc mov r21=ar.fpsr mov r26=ar.pfs END(fsys_fallback_syscall) /* FALL THROUGH */ -GLOBAL_ENTRY(paravirt_fsys_bubble_down) +GLOBAL_ENTRY(fsys_bubble_down) .prologue .altrp b6 .body @@ -641,7 +640,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) * * PSR.BE : already is turned off in __kernel_syscall_via_epc() * PSR.AC : don't care (kernel normally turns PSR.AC on) - * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets + * PSR.I : already turned off by the time fsys_bubble_down gets * invoked * PSR.DFL: always 0 (kernel never turns it on) * PSR.DFH: don't care --- kernel never touches f32-f127 on its own @@ -651,7 +650,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) * PSR.DB : don't care --- kernel never enables kernel-level * breakpoints * PSR.TB : must be 0 already; if it wasn't zero on entry to - * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down + * __kernel_syscall_via_epc, the branch to fsys_bubble_down * will trigger a taken branch; the taken-trap-handler then * converts the syscall into a break-based system-call. 
*/ @@ -684,7 +683,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) ;; mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 #ifdef CONFIG_VIRT_CPU_ACCOUNTING - MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting + mov.m r30=ar.itc // M get cycle for accounting #else nop.m 0 #endif @@ -735,21 +734,21 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) mov rp=r14 // I0 set the real return addr and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs + ssm psr.i // M2 we're on kernel stacks now, reenable irqs cmp.eq p8,p0=r3,r0 // A (p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT nop.m 0 (p8) br.call.sptk.many b6=b6 // B (ignore return address) br.cond.spnt ia64_trace_syscall // B -END(paravirt_fsys_bubble_down) +END(fsys_bubble_down) .rodata .align 8 - .globl paravirt_fsyscall_table + .globl fsyscall_table - data8 paravirt_fsys_bubble_down -paravirt_fsyscall_table: + data8 fsys_bubble_down +fsyscall_table: data8 fsys_ni_syscall data8 0 // exit // 1025 data8 0 // read @@ -1034,4 +1033,4 @@ paravirt_fsyscall_table: // fill in zeros for the remaining entries .zero: - .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0 + .space fsyscall_table + 8*NR_syscalls - .zero, 0 diff --git a/trunk/arch/ia64/kernel/gate.S b/trunk/arch/ia64/kernel/gate.S index cf5e0a105e16..74b1ccce4e84 100644 --- a/trunk/arch/ia64/kernel/gate.S +++ b/trunk/arch/ia64/kernel/gate.S @@ -13,7 +13,6 @@ #include #include #include -#include "paravirt_inst.h" /* * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, @@ -49,6 +48,87 @@ GLOBAL_ENTRY(__kernel_syscall_via_break) } END(__kernel_syscall_via_break) +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue +} + ;; + rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X + ;; + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + mov r29=psr // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 + ;; +(p8) ssm psr.i +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. 
+ */ +#ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + ssm psr.i + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN +END(__kernel_syscall_via_epc) + # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) @@ -294,92 +374,3 @@ restore_rbs: // invala not necessary as that will happen when returning to user-mode br.cond.sptk back_from_restore_rbs END(__kernel_sigtramp) - -/* - * On entry: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * b6 = return address - * On exit: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * all other "scratch" registers: undefined - * all "preserved" registers: same as on entry - */ - -GLOBAL_ENTRY(__kernel_syscall_via_epc) - .prologue - .altrp b6 - .body -{ - /* - * Note: the kernel cannot assume that the first two instructions in this - * bundle get executed. The remaining code must be safe even if - * they do not get executed. - */ - adds r17=-1024,r15 // A - mov r10=0 // A default to successful syscall execution - epc // B causes split-issue -} - ;; - RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d) - LOAD_FSYSCALL_TABLE(r14) // X - ;; - mov r16=IA64_KR(CURRENT) // M2 (12 cyc) - shladd r18=r17,3,r14 // A - mov r19=NR_syscalls-1 // A - ;; - lfetch [r18] // M0|1 - MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc) - // If r17 is a NaT, p6 will be zero - cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? - ;; - mov r21=ar.fpsr // M2 (12 cyc) - tnat.nz p10,p9=r15 // I0 - mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) - ;; - srlz.d // M0 (forces split-issue) ensure PSR.BE==0 -(p6) ld8 r18=[r18] // M0|1 - nop.i 0 - ;; - nop.m 0 -(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) - nop.i 0 - ;; - SSM_PSR_I(p8, p14, r25) -(p6) mov b7=r18 // I0 -(p8) br.dptk.many b7 // B - - mov r27=ar.rsc // M2 (12 cyc) -/* - * brl.cond doesn't work as intended because the linker would convert this branch - * into a branch to a PLT. Perhaps there will be a way to avoid this with some - * future version of the linker. In the meantime, we just use an indirect branch - * instead. - */ -#ifdef CONFIG_ITANIUM -(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down - ;; -(p6) mov b7=r14 -(p6) br.sptk.many b7 -#else - BRL_COND_FSYS_BUBBLE_DOWN(p6) -#endif - SSM_PSR_I(p0, p14, r10) - mov r10=-1 -(p10) mov r8=EINVAL -(p9) mov r8=ENOSYS - FSYS_RETURN - -#ifdef CONFIG_PARAVIRT - /* - * padd to make the size of this symbol constant - * independent of paravirtualization. - */ - .align PAGE_SIZE / 8 -#endif -END(__kernel_syscall_via_epc) diff --git a/trunk/arch/ia64/kernel/gate.lds.S b/trunk/arch/ia64/kernel/gate.lds.S index 88c64ed47c36..3cb1abc00e24 100644 --- a/trunk/arch/ia64/kernel/gate.lds.S +++ b/trunk/arch/ia64/kernel/gate.lds.S @@ -7,7 +7,6 @@ #include -#include "paravirt_patchlist.h" SECTIONS { @@ -34,21 +33,21 @@ SECTIONS . 
= GATE_ADDR + 0x600; .data.patch : { - __paravirt_start_gate_mckinley_e9_patchlist = .; + __start_gate_mckinley_e9_patchlist = .; *(.data.patch.mckinley_e9) - __paravirt_end_gate_mckinley_e9_patchlist = .; + __end_gate_mckinley_e9_patchlist = .; - __paravirt_start_gate_vtop_patchlist = .; + __start_gate_vtop_patchlist = .; *(.data.patch.vtop) - __paravirt_end_gate_vtop_patchlist = .; + __end_gate_vtop_patchlist = .; - __paravirt_start_gate_fsyscall_patchlist = .; + __start_gate_fsyscall_patchlist = .; *(.data.patch.fsyscall_table) - __paravirt_end_gate_fsyscall_patchlist = .; + __end_gate_fsyscall_patchlist = .; - __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .; + __start_gate_brl_fsys_bubble_down_patchlist = .; *(.data.patch.brl_fsys_bubble_down) - __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .; + __end_gate_brl_fsys_bubble_down_patchlist = .; } :readable .IA_64.unwind_info : { *(.IA_64.unwind_info*) } diff --git a/trunk/arch/ia64/kernel/head.S b/trunk/arch/ia64/kernel/head.S index 23f846de62d5..59301c472800 100644 --- a/trunk/arch/ia64/kernel/head.S +++ b/trunk/arch/ia64/kernel/head.S @@ -1050,7 +1050,7 @@ END(ia64_delay_loop) * except that the multiplication and the shift are done with 128-bit * intermediate precision so that we can produce a full 64-bit result. */ -GLOBAL_ENTRY(ia64_native_sched_clock) +GLOBAL_ENTRY(sched_clock) addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 mov.m r9=ar.itc // fetch cycle-counter (35 cyc) ;; @@ -1066,13 +1066,7 @@ GLOBAL_ENTRY(ia64_native_sched_clock) ;; shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT br.ret.sptk.many rp -END(ia64_native_sched_clock) -#ifndef CONFIG_PARAVIRT - //unsigned long long - //sched_clock(void) __attribute__((alias("ia64_native_sched_clock"))); - .global sched_clock -sched_clock = ia64_native_sched_clock -#endif +END(sched_clock) #ifdef CONFIG_VIRT_CPU_ACCOUNTING GLOBAL_ENTRY(cycle_to_cputime) diff --git a/trunk/arch/ia64/kernel/ivt.S b/trunk/arch/ia64/kernel/ivt.S index ec9a5fdfa1b9..f675d8e33853 100644 --- a/trunk/arch/ia64/kernel/ivt.S +++ b/trunk/arch/ia64/kernel/ivt.S @@ -804,7 +804,7 @@ ENTRY(break_fault) /////////////////////////////////////////////////////////////////////// st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag #ifdef CONFIG_VIRT_CPU_ACCOUNTING - MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting + mov.m r30=ar.itc // M get cycle for accounting #else mov b6=r30 // I0 setup syscall handler branch reg early #endif diff --git a/trunk/arch/ia64/kernel/mca.c b/trunk/arch/ia64/kernel/mca.c index 8f33a8840422..bab1de2d2f6a 100644 --- a/trunk/arch/ia64/kernel/mca.c +++ b/trunk/arch/ia64/kernel/mca.c @@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg) ia64_mca_cmc_int_handler(cmc_irq, arg); - cpuid = cpumask_next(cpuid+1, cpu_online_mask); + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); - if (cpuid < nr_cpu_ids) { + if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); } else { /* If no log record, switch out of polling mode */ @@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg) ia64_mca_cpe_int_handler(cpe_irq, arg); - cpuid = cpumask_next(cpuid+1, cpu_online_mask); + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); diff --git a/trunk/arch/ia64/kernel/module.c b/trunk/arch/ia64/kernel/module.c index da3b0cf495a3..aaa7d901521f 100644 --- a/trunk/arch/ia64/kernel/module.c +++ 
b/trunk/arch/ia64/kernel/module.c @@ -446,14 +446,6 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, mod->arch.opd = s; else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) mod->arch.unwind = s; -#ifdef CONFIG_PARAVIRT - else if (strcmp(".paravirt_bundles", - secstrings + s->sh_name) == 0) - mod->arch.paravirt_bundles = s; - else if (strcmp(".paravirt_insts", - secstrings + s->sh_name) == 0) - mod->arch.paravirt_insts = s; -#endif if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { printk(KERN_ERR "%s: sections missing\n", mod->name); @@ -533,7 +525,8 @@ get_ltoff (struct module *mod, uint64_t value, int *okp) goto found; /* Not enough GOT entries? */ - BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)); + if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)) + BUG(); e->val = value; ++mod->arch.next_got_entry; @@ -928,30 +921,6 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo DEBUGP("%s: init: entry=%p\n", __func__, mod->init); if (mod->arch.unwind) register_unwind_table(mod); -#ifdef CONFIG_PARAVIRT - if (mod->arch.paravirt_bundles) { - struct paravirt_patch_site_bundle *start = - (struct paravirt_patch_site_bundle *) - mod->arch.paravirt_bundles->sh_addr; - struct paravirt_patch_site_bundle *end = - (struct paravirt_patch_site_bundle *) - (mod->arch.paravirt_bundles->sh_addr + - mod->arch.paravirt_bundles->sh_size); - - paravirt_patch_apply_bundle(start, end); - } - if (mod->arch.paravirt_insts) { - struct paravirt_patch_site_inst *start = - (struct paravirt_patch_site_inst *) - mod->arch.paravirt_insts->sh_addr; - struct paravirt_patch_site_inst *end = - (struct paravirt_patch_site_inst *) - (mod->arch.paravirt_insts->sh_addr + - mod->arch.paravirt_insts->sh_size); - - paravirt_patch_apply_inst(start, end); - } -#endif return 0; } diff --git a/trunk/arch/ia64/kernel/paravirt.c b/trunk/arch/ia64/kernel/paravirt.c index a21d7bb9c69c..9f14c16f6369 100644 --- a/trunk/arch/ia64/kernel/paravirt.c +++ b/trunk/arch/ia64/kernel/paravirt.c @@ -46,23 +46,13 @@ struct pv_info pv_info = { * initialization hooks. */ -static void __init -ia64_native_patch_branch(unsigned long tag, unsigned long type); - -struct pv_init_ops pv_init_ops = -{ -#ifdef ASM_SUPPORTED - .patch_bundle = ia64_native_patch_bundle, -#endif - .patch_branch = ia64_native_patch_branch, -}; +struct pv_init_ops pv_init_ops; /*************************************************************************** * pv_cpu_ops * intrinsics hooks. 
*/ -#ifndef ASM_SUPPORTED /* ia64_native_xxx are macros so that we have to make them real functions */ #define DEFINE_VOID_FUNC1(name) \ @@ -70,14 +60,7 @@ struct pv_init_ops pv_init_ops = ia64_native_ ## name ## _func(unsigned long arg) \ { \ ia64_native_ ## name(arg); \ - } - -#define DEFINE_VOID_FUNC1_VOID(name) \ - static void \ - ia64_native_ ## name ## _func(void *arg) \ - { \ - ia64_native_ ## name(arg); \ - } + } \ #define DEFINE_VOID_FUNC2(name) \ static void \ @@ -85,7 +68,7 @@ struct pv_init_ops pv_init_ops = unsigned long arg1) \ { \ ia64_native_ ## name(arg0, arg1); \ - } + } \ #define DEFINE_FUNC0(name) \ static unsigned long \ @@ -101,7 +84,7 @@ struct pv_init_ops pv_init_ops = return ia64_native_ ## name(arg); \ } \ -DEFINE_VOID_FUNC1_VOID(fc); +DEFINE_VOID_FUNC1(fc); DEFINE_VOID_FUNC1(intrin_local_irq_restore); DEFINE_VOID_FUNC2(ptcga); @@ -291,266 +274,6 @@ ia64_native_setreg_func(int regnum, unsigned long val) break; } } -#else - -#define __DEFINE_FUNC(name, code) \ - extern const char ia64_native_ ## name ## _direct_start[]; \ - extern const char ia64_native_ ## name ## _direct_end[]; \ - asm (".align 32\n" \ - ".proc ia64_native_" #name "_func\n" \ - "ia64_native_" #name "_func:\n" \ - "ia64_native_" #name "_direct_start:\n" \ - code \ - "ia64_native_" #name "_direct_end:\n" \ - "br.cond.sptk.many b6\n" \ - ".endp ia64_native_" #name "_func\n") - -#define DEFINE_VOID_FUNC0(name, code) \ - extern void \ - ia64_native_ ## name ## _func(void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1(name, code) \ - extern void \ - ia64_native_ ## name ## _func(unsigned long arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1_VOID(name, code) \ - extern void \ - ia64_native_ ## name ## _func(void *arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC2(name, code) \ - extern void \ - ia64_native_ ## name ## _func(unsigned long arg0, \ - unsigned long arg1); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC0(name, code) \ - extern unsigned long \ - ia64_native_ ## name ## _func(void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC1(name, type, code) \ - extern unsigned long \ - ia64_native_ ## name ## _func(type arg); \ - __DEFINE_FUNC(name, code) - -DEFINE_VOID_FUNC1_VOID(fc, - "fc r8\n"); -DEFINE_VOID_FUNC1(intrin_local_irq_restore, - ";;\n" - " cmp.ne p6, p7 = r8, r0\n" - ";;\n" - "(p6) ssm psr.i\n" - "(p7) rsm psr.i\n" - ";;\n" - "(p6) srlz.d\n"); - -DEFINE_VOID_FUNC2(ptcga, - "ptc.ga r8, r9\n"); -DEFINE_VOID_FUNC2(set_rr, - "mov rr[r8] = r9\n"); - -/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */ -DEFINE_FUNC0(get_psr_i, - "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n" - "mov r8 = psr\n" - ";;\n" - "and r8 = r2, r8\n"); - -DEFINE_FUNC1(thash, unsigned long, - "thash r8 = r8\n"); -DEFINE_FUNC1(get_cpuid, int, - "mov r8 = cpuid[r8]\n"); -DEFINE_FUNC1(get_pmd, int, - "mov r8 = pmd[r8]\n"); -DEFINE_FUNC1(get_rr, unsigned long, - "mov r8 = rr[r8]\n"); - -DEFINE_VOID_FUNC0(ssm_i, - "ssm psr.i\n"); -DEFINE_VOID_FUNC0(rsm_i, - "rsm psr.i\n"); - -extern void -ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4); -__DEFINE_FUNC(set_rr0_to_rr4, - "mov rr[r0] = r8\n" - "movl r2 = 0x2000000000000000\n" - ";;\n" - "mov rr[r2] = r9\n" - "shl r3 = r2, 1\n" /* movl r3 = 0x4000000000000000 */ - ";;\n" - "add r2 = r2, r3\n" /* movl r2 = 0x6000000000000000 */ - "mov rr[r3] = r10\n" - ";;\n" - "mov rr[r2] = r11\n" - "shl r3 = r3, 1\n" /* movl r3 = 0x8000000000000000 */ 
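
The !ASM_SUPPORTED wrappers deleted in this hunk exist because the ia64_native_*() intrinsics are macros, so real functions must be materialized before their addresses can be stored in pv_cpu_ops. A minimal C analogue of the pattern, with the intrinsic body as a stand-in rather than the real ptc.ga inline asm:

	/* Stand-in for the ptc.ga purge intrinsic, which is inline asm. */
	#define ia64_native_ptcga(a, b)	((void)(a), (void)(b))

	static void ia64_native_ptcga_func(unsigned long arg0,
					   unsigned long arg1)
	{
		ia64_native_ptcga(arg0, arg1);	/* expand the macro in a real body */
	}

	/* The ops table can then carry an ordinary function pointer: */
	struct pv_cpu_ops_sketch {
		void (*ptcga)(unsigned long, unsigned long);
	} ops_sketch = { .ptcga = ia64_native_ptcga_func };
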
- ";;\n" - "mov rr[r3] = r14\n"); - -extern unsigned long ia64_native_getreg_func(int regnum); -asm(".global ia64_native_getreg_func\n"); -#define __DEFINE_GET_REG(id, reg) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r8\n" \ - ";;\n" \ - "(p6) mov r8 = " #reg "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" -#define __DEFINE_GET_AR(id, reg) __DEFINE_GET_REG(AR_ ## id, ar.reg) -#define __DEFINE_GET_CR(id, reg) __DEFINE_GET_REG(CR_ ## id, cr.reg) - -__DEFINE_FUNC(getreg, - __DEFINE_GET_REG(GP, gp) - /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */ - __DEFINE_GET_REG(PSR, psr) - __DEFINE_GET_REG(TP, tp) - __DEFINE_GET_REG(SP, sp) - - __DEFINE_GET_REG(AR_KR0, ar0) - __DEFINE_GET_REG(AR_KR1, ar1) - __DEFINE_GET_REG(AR_KR2, ar2) - __DEFINE_GET_REG(AR_KR3, ar3) - __DEFINE_GET_REG(AR_KR4, ar4) - __DEFINE_GET_REG(AR_KR5, ar5) - __DEFINE_GET_REG(AR_KR6, ar6) - __DEFINE_GET_REG(AR_KR7, ar7) - __DEFINE_GET_AR(RSC, rsc) - __DEFINE_GET_AR(BSP, bsp) - __DEFINE_GET_AR(BSPSTORE, bspstore) - __DEFINE_GET_AR(RNAT, rnat) - __DEFINE_GET_AR(FCR, fcr) - __DEFINE_GET_AR(EFLAG, eflag) - __DEFINE_GET_AR(CSD, csd) - __DEFINE_GET_AR(SSD, ssd) - __DEFINE_GET_REG(AR_CFLAG, ar27) - __DEFINE_GET_AR(FSR, fsr) - __DEFINE_GET_AR(FIR, fir) - __DEFINE_GET_AR(FDR, fdr) - __DEFINE_GET_AR(CCV, ccv) - __DEFINE_GET_AR(UNAT, unat) - __DEFINE_GET_AR(FPSR, fpsr) - __DEFINE_GET_AR(ITC, itc) - __DEFINE_GET_AR(PFS, pfs) - __DEFINE_GET_AR(LC, lc) - __DEFINE_GET_AR(EC, ec) - - __DEFINE_GET_CR(DCR, dcr) - __DEFINE_GET_CR(ITM, itm) - __DEFINE_GET_CR(IVA, iva) - __DEFINE_GET_CR(PTA, pta) - __DEFINE_GET_CR(IPSR, ipsr) - __DEFINE_GET_CR(ISR, isr) - __DEFINE_GET_CR(IIP, iip) - __DEFINE_GET_CR(IFA, ifa) - __DEFINE_GET_CR(ITIR, itir) - __DEFINE_GET_CR(IIPA, iipa) - __DEFINE_GET_CR(IFS, ifs) - __DEFINE_GET_CR(IIM, iim) - __DEFINE_GET_CR(IHA, iha) - __DEFINE_GET_CR(LID, lid) - __DEFINE_GET_CR(IVR, ivr) - __DEFINE_GET_CR(TPR, tpr) - __DEFINE_GET_CR(EOI, eoi) - __DEFINE_GET_CR(IRR0, irr0) - __DEFINE_GET_CR(IRR1, irr1) - __DEFINE_GET_CR(IRR2, irr2) - __DEFINE_GET_CR(IRR3, irr3) - __DEFINE_GET_CR(ITV, itv) - __DEFINE_GET_CR(PMV, pmv) - __DEFINE_GET_CR(CMCV, cmcv) - __DEFINE_GET_CR(LRR0, lrr0) - __DEFINE_GET_CR(LRR1, lrr1) - - "mov r8 = -1\n" /* unsupported case */ - ); - -extern void ia64_native_setreg_func(int regnum, unsigned long val); -asm(".global ia64_native_setreg_func\n"); -#define __DEFINE_SET_REG(id, reg) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r9\n" \ - ";;\n" \ - "(p6) mov " #reg " = r8\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" -#define __DEFINE_SET_AR(id, reg) __DEFINE_SET_REG(AR_ ## id, ar.reg) -#define __DEFINE_SET_CR(id, reg) __DEFINE_SET_REG(CR_ ## id, cr.reg) -__DEFINE_FUNC(setreg, - "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r9\n" - ";;\n" - "(p6) mov psr.l = r8\n" -#ifdef HAVE_SERIALIZE_DIRECTIVE - ".serialize.data\n" -#endif - "(p6) br.cond.sptk.many b6\n" - __DEFINE_SET_REG(GP, gp) - __DEFINE_SET_REG(SP, sp) - - __DEFINE_SET_REG(AR_KR0, ar0) - __DEFINE_SET_REG(AR_KR1, ar1) - __DEFINE_SET_REG(AR_KR2, ar2) - __DEFINE_SET_REG(AR_KR3, ar3) - __DEFINE_SET_REG(AR_KR4, ar4) - __DEFINE_SET_REG(AR_KR5, ar5) - __DEFINE_SET_REG(AR_KR6, ar6) - __DEFINE_SET_REG(AR_KR7, ar7) - __DEFINE_SET_AR(RSC, rsc) - __DEFINE_SET_AR(BSP, bsp) - __DEFINE_SET_AR(BSPSTORE, bspstore) - __DEFINE_SET_AR(RNAT, rnat) - __DEFINE_SET_AR(FCR, fcr) - __DEFINE_SET_AR(EFLAG, eflag) - __DEFINE_SET_AR(CSD, csd) - 
__DEFINE_SET_AR(SSD, ssd) - __DEFINE_SET_REG(AR_CFLAG, ar27) - __DEFINE_SET_AR(FSR, fsr) - __DEFINE_SET_AR(FIR, fir) - __DEFINE_SET_AR(FDR, fdr) - __DEFINE_SET_AR(CCV, ccv) - __DEFINE_SET_AR(UNAT, unat) - __DEFINE_SET_AR(FPSR, fpsr) - __DEFINE_SET_AR(ITC, itc) - __DEFINE_SET_AR(PFS, pfs) - __DEFINE_SET_AR(LC, lc) - __DEFINE_SET_AR(EC, ec) - - __DEFINE_SET_CR(DCR, dcr) - __DEFINE_SET_CR(ITM, itm) - __DEFINE_SET_CR(IVA, iva) - __DEFINE_SET_CR(PTA, pta) - __DEFINE_SET_CR(IPSR, ipsr) - __DEFINE_SET_CR(ISR, isr) - __DEFINE_SET_CR(IIP, iip) - __DEFINE_SET_CR(IFA, ifa) - __DEFINE_SET_CR(ITIR, itir) - __DEFINE_SET_CR(IIPA, iipa) - __DEFINE_SET_CR(IFS, ifs) - __DEFINE_SET_CR(IIM, iim) - __DEFINE_SET_CR(IHA, iha) - __DEFINE_SET_CR(LID, lid) - __DEFINE_SET_CR(IVR, ivr) - __DEFINE_SET_CR(TPR, tpr) - __DEFINE_SET_CR(EOI, eoi) - __DEFINE_SET_CR(IRR0, irr0) - __DEFINE_SET_CR(IRR1, irr1) - __DEFINE_SET_CR(IRR2, irr2) - __DEFINE_SET_CR(IRR3, irr3) - __DEFINE_SET_CR(ITV, itv) - __DEFINE_SET_CR(PMV, pmv) - __DEFINE_SET_CR(CMCV, cmcv) - __DEFINE_SET_CR(LRR0, lrr0) - __DEFINE_SET_CR(LRR1, lrr1) - ); -#endif struct pv_cpu_ops pv_cpu_ops = { .fc = ia64_native_fc_func, @@ -643,258 +366,4 @@ ia64_native_do_steal_accounting(unsigned long *new_itm) struct pv_time_ops pv_time_ops = { .do_steal_accounting = ia64_native_do_steal_accounting, - .sched_clock = ia64_native_sched_clock, -}; - -/*************************************************************************** - * binary pacthing - * pv_init_ops.patch_bundle - */ - -#ifdef ASM_SUPPORTED -#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg) \ - __DEFINE_FUNC(get_ ## name, \ - ";;\n" \ - "mov r8 = " #reg "\n" \ - ";;\n") - -#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ - __DEFINE_FUNC(set_ ## name, \ - ";;\n" \ - "mov " #reg " = r8\n" \ - ";;\n") - -#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg); \ - IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ - -#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg) - -#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg) - - -IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr); -IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp); - -/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */ -__DEFINE_FUNC(set_psr_l, - ";;\n" - "mov psr.l = r8\n" -#ifdef HAVE_SERIALIZE_DIRECTIVE - ".serialize.data\n" -#endif - ";;\n"); - -IA64_NATIVE_PATCH_DEFINE_REG(gp, gp); -IA64_NATIVE_PATCH_DEFINE_REG(sp, sp); - -IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0); -IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1); -IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2); -IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3); -IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4); -IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5); -IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6); -IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7); - -IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc); -IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp); -IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore); -IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat); -IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr); -IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag); -IA64_NATIVE_PATCH_DEFINE_AR(csd, csd); -IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd); -IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27); -IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr); -IA64_NATIVE_PATCH_DEFINE_AR(fir, fir); -IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr); -IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv); -IA64_NATIVE_PATCH_DEFINE_AR(unat, unat); -IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr); -IA64_NATIVE_PATCH_DEFINE_AR(itc, itc); -IA64_NATIVE_PATCH_DEFINE_AR(pfs, 
pfs); -IA64_NATIVE_PATCH_DEFINE_AR(lc, lc); -IA64_NATIVE_PATCH_DEFINE_AR(ec, ec); - -IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr); -IA64_NATIVE_PATCH_DEFINE_CR(itm, itm); -IA64_NATIVE_PATCH_DEFINE_CR(iva, iva); -IA64_NATIVE_PATCH_DEFINE_CR(pta, pta); -IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr); -IA64_NATIVE_PATCH_DEFINE_CR(isr, isr); -IA64_NATIVE_PATCH_DEFINE_CR(iip, iip); -IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa); -IA64_NATIVE_PATCH_DEFINE_CR(itir, itir); -IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa); -IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs); -IA64_NATIVE_PATCH_DEFINE_CR(iim, iim); -IA64_NATIVE_PATCH_DEFINE_CR(iha, iha); -IA64_NATIVE_PATCH_DEFINE_CR(lid, lid); -IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr); -IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr); -IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi); -IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0); -IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1); -IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2); -IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3); -IA64_NATIVE_PATCH_DEFINE_CR(itv, itv); -IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv); -IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv); -IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0); -IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1); - -static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[] -__initdata_or_module = -{ -#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type) \ - { \ - (void*)ia64_native_ ## name ## _direct_start, \ - (void*)ia64_native_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_ ## type, \ - } - - IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC), - IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), - IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR), - IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR), - IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), - IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore, - INTRIN_LOCAL_IRQ_RESTORE), - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ - { \ - (void*)ia64_native_get_ ## name ## _direct_start, \ - (void*)ia64_native_get_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ - } - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - { \ - (void*)ia64_native_set_ ## name ## _direct_start, \ - (void*)ia64_native_set_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ - } - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg), \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg) - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg) - - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6), - 
IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1), }; - -unsigned long __init_or_module -ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type) -{ - const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) / - sizeof(ia64_native_patch_bundle_elems[0]); - - return __paravirt_patch_apply_bundle(sbundle, ebundle, type, - ia64_native_patch_bundle_elems, - nelems, NULL); -} -#endif /* ASM_SUPPOTED */ - -extern const char ia64_native_switch_to[]; -extern const char ia64_native_leave_syscall[]; -extern const char ia64_native_work_processed_syscall[]; -extern const char ia64_native_leave_kernel[]; - -const struct paravirt_patch_branch_target ia64_native_branch_target[] -__initconst = { -#define PARAVIRT_BR_TARGET(name, type) \ - { \ - ia64_native_ ## name, \ - PARAVIRT_PATCH_TYPE_BR_ ## type, \ - } - PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), - PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), - PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), - PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), -}; - -static void __init -ia64_native_patch_branch(unsigned long tag, unsigned long type) -{ - const unsigned long nelem = - sizeof(ia64_native_branch_target) / - sizeof(ia64_native_branch_target[0]); - __paravirt_patch_apply_branch(tag, type, - ia64_native_branch_target, nelem); -} diff --git a/trunk/arch/ia64/kernel/paravirt_patch.c b/trunk/arch/ia64/kernel/paravirt_patch.c deleted 
file mode 100644 index bfdfef1b1ffd..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patch.c +++ /dev/null @@ -1,514 +0,0 @@ -/****************************************************************************** - * linux/arch/ia64/xen/paravirt_patch.c - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include -#include -#include -#include - -typedef union ia64_inst { - struct { - unsigned long long qp : 6; - unsigned long long : 31; - unsigned long long opcode : 4; - unsigned long long reserved : 23; - } generic; - unsigned long long l; -} ia64_inst_t; - -/* - * flush_icache_range() can't be used here. - * we are here before cpu_init() which initializes - * ia64_i_cache_stride_shift. flush_icache_range() uses it. - */ -void __init_or_module -paravirt_flush_i_cache_range(const void *instr, unsigned long size) -{ - extern void paravirt_fc_i(const void *addr); - unsigned long i; - - for (i = 0; i < size; i += sizeof(bundle_t)) - paravirt_fc_i(instr + i); -} - -bundle_t* __init_or_module -paravirt_get_bundle(unsigned long tag) -{ - return (bundle_t *)(tag & ~3UL); -} - -unsigned long __init_or_module -paravirt_get_slot(unsigned long tag) -{ - return tag & 3UL; -} - -unsigned long __init_or_module -paravirt_get_num_inst(unsigned long stag, unsigned long etag) -{ - bundle_t *sbundle = paravirt_get_bundle(stag); - unsigned long sslot = paravirt_get_slot(stag); - bundle_t *ebundle = paravirt_get_bundle(etag); - unsigned long eslot = paravirt_get_slot(etag); - - return (ebundle - sbundle) * 3 + eslot - sslot + 1; -} - -unsigned long __init_or_module -paravirt_get_next_tag(unsigned long tag) -{ - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - case 1: - return tag + 1; - case 2: { - bundle_t *bundle = paravirt_get_bundle(tag); - return (unsigned long)(bundle + 1); - } - default: - BUG(); - } - /* NOTREACHED */ -} - -ia64_inst_t __init_or_module -paravirt_read_slot0(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad0.slot0; - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_slot1(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad0.slot1_p0 | - ((unsigned long long)bundle->quad1.slot1_p1 << 18UL); - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_slot2(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad1.slot2; - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_inst(unsigned long tag) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - return paravirt_read_slot0(bundle); - case 1: - return paravirt_read_slot1(bundle); - case 2: - return paravirt_read_slot2(bundle); - default: - BUG(); - } - /* NOTREACHED */ -} - -void __init_or_module 
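
A hedged sketch of the tag arithmetic used by the paravirt_get_bundle()/paravirt_get_slot() helpers in the file being deleted here: a patch tag packs the address of a 16-byte-aligned instruction bundle with a slot number (0..2) in the low two bits, so both parts recover by masking. Assumes sizeof(bundle_t) == 16, as on IA-64:

	typedef unsigned long tag_t;

	static void *tag_to_bundle(tag_t tag)		{ return (void *)(tag & ~3UL); }
	static unsigned long tag_to_slot(tag_t tag)	{ return tag & 3UL; }

	/* Instructions from stag to etag inclusive: three slots per bundle. */
	static unsigned long tag_num_inst(tag_t stag, tag_t etag)
	{
		long bundles = ((char *)tag_to_bundle(etag) -
				(char *)tag_to_bundle(stag)) / 16;
		return bundles * 3 + tag_to_slot(etag) - tag_to_slot(stag) + 1;
	}

	/* Advance one slot; slot 2 wraps to slot 0 of the next bundle. */
	static tag_t tag_next(tag_t tag)
	{
		if (tag_to_slot(tag) == 2)
			return (tag_t)((char *)tag_to_bundle(tag) + 16);
		return tag + 1;
	}
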
-paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad0.slot0 = inst.l; -} - -void __init_or_module -paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad0.slot1_p0 = inst.l; - bundle->quad1.slot1_p1 = inst.l >> 18UL; -} - -void __init_or_module -paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad1.slot2 = inst.l; -} - -void __init_or_module -paravirt_write_inst(unsigned long tag, ia64_inst_t inst) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - paravirt_write_slot0(bundle, inst); - break; - case 1: - paravirt_write_slot1(bundle, inst); - break; - case 2: - paravirt_write_slot2(bundle, inst); - break; - default: - BUG(); - break; - } - paravirt_flush_i_cache_range(bundle, sizeof(*bundle)); -} - -/* for debug */ -void -paravirt_print_bundle(const bundle_t *bundle) -{ - const unsigned long *quad = (const unsigned long *)bundle; - ia64_inst_t slot0 = paravirt_read_slot0(bundle); - ia64_inst_t slot1 = paravirt_read_slot1(bundle); - ia64_inst_t slot2 = paravirt_read_slot2(bundle); - - printk(KERN_DEBUG - "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]); - printk(KERN_DEBUG - "bundle template 0x%x\n", - bundle->quad0.template); - printk(KERN_DEBUG - "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n", - (unsigned long)bundle->quad0.slot0, - (unsigned long)bundle->quad0.slot1_p0, - (unsigned long)bundle->quad1.slot1_p1, - (unsigned long)bundle->quad1.slot2); - printk(KERN_DEBUG - "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n", - slot0.l, slot1.l, slot2.l); -} - -static int noreplace_paravirt __init_or_module = 0; - -static int __init setup_noreplace_paravirt(char *str) -{ - noreplace_paravirt = 1; - return 1; -} -__setup("noreplace-paravirt", setup_noreplace_paravirt); - -#ifdef ASM_SUPPORTED -static void __init_or_module -fill_nop_bundle(void *sbundle, void *ebundle) -{ - extern const char paravirt_nop_bundle[]; - extern const unsigned long paravirt_nop_bundle_size; - - void *bundle = sbundle; - - BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); - BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); - - while (bundle < ebundle) { - memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size); - - bundle += paravirt_nop_bundle_size; - } -} - -/* helper function */ -unsigned long __init_or_module -__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, - const struct paravirt_patch_bundle_elem *elems, - unsigned long nelems, - const struct paravirt_patch_bundle_elem **found) -{ - unsigned long used = 0; - unsigned long i; - - BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); - BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); - - found = NULL; - for (i = 0; i < nelems; i++) { - const struct paravirt_patch_bundle_elem *p = &elems[i]; - if (p->type == type) { - unsigned long need = p->ebundle - p->sbundle; - unsigned long room = ebundle - sbundle; - - if (found != NULL) - *found = p; - - if (room < need) { - /* no room to replace. skip it */ - printk(KERN_DEBUG - "the space is too small to put " - "bundles. 
type %ld need %ld room %ld\n", - type, need, room); - break; - } - - used = need; - memcpy(sbundle, p->sbundle, used); - break; - } - } - - return used; -} - -void __init_or_module -paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, - const struct paravirt_patch_site_bundle *end) -{ - const struct paravirt_patch_site_bundle *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_bundle == NULL) - return; - - for (p = start; p < end; p++) { - unsigned long used; - - used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle, - p->type); - if (used == 0) - continue; - - fill_nop_bundle(p->sbundle + used, p->ebundle); - paravirt_flush_i_cache_range(p->sbundle, - p->ebundle - p->sbundle); - } - ia64_sync_i(); - ia64_srlz_i(); -} - -/* - * nop.i, nop.m, nop.f instruction are same format. - * but nop.b has differennt format. - * This doesn't support nop.b for now. - */ -static void __init_or_module -fill_nop_inst(unsigned long stag, unsigned long etag) -{ - extern const bundle_t paravirt_nop_mfi_inst_bundle[]; - unsigned long tag; - const ia64_inst_t nop_inst = - paravirt_read_slot0(paravirt_nop_mfi_inst_bundle); - - for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag)) - paravirt_write_inst(tag, nop_inst); -} - -void __init_or_module -paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, - const struct paravirt_patch_site_inst *end) -{ - const struct paravirt_patch_site_inst *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_inst == NULL) - return; - - for (p = start; p < end; p++) { - unsigned long tag; - bundle_t *sbundle; - bundle_t *ebundle; - - tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type); - if (tag == p->stag) - continue; - - fill_nop_inst(tag, p->etag); - sbundle = paravirt_get_bundle(p->stag); - ebundle = paravirt_get_bundle(p->etag) + 1; - paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) * - sizeof(bundle_t)); - } - ia64_sync_i(); - ia64_srlz_i(); -} -#endif /* ASM_SUPPOTED */ - -/* brl.cond.sptk.many X3 */ -typedef union inst_x3_op { - ia64_inst_t inst; - struct { - unsigned long qp: 6; - unsigned long btyp: 3; - unsigned long unused: 3; - unsigned long p: 1; - unsigned long imm20b: 20; - unsigned long wh: 2; - unsigned long d: 1; - unsigned long i: 1; - unsigned long opcode: 4; - }; - unsigned long l; -} inst_x3_op_t; - -typedef union inst_x3_imm { - ia64_inst_t inst; - struct { - unsigned long unused: 2; - unsigned long imm39: 39; - }; - unsigned long l; -} inst_x3_imm_t; - -void __init_or_module -paravirt_patch_reloc_brl(unsigned long tag, const void *target) -{ - unsigned long tag_op = paravirt_get_next_tag(tag); - unsigned long tag_imm = tag; - bundle_t *bundle = paravirt_get_bundle(tag); - - ia64_inst_t inst_op = paravirt_read_inst(tag_op); - ia64_inst_t inst_imm = paravirt_read_inst(tag_imm); - - inst_x3_op_t inst_x3_op = { .l = inst_op.l }; - inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l }; - - unsigned long imm60 = - ((unsigned long)target - (unsigned long)bundle) >> 4; - - BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */ - BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); - - /* imm60[59] 1bit */ - inst_x3_op.i = (imm60 >> 59) & 1; - /* imm60[19:0] 20bit */ - inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1); - /* imm60[58:20] 39bit */ - inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1); - - inst_op.l = inst_x3_op.l; - inst_imm.l = inst_x3_imm.l; - - paravirt_write_inst(tag_op, inst_op); - paravirt_write_inst(tag_imm, inst_imm); -} - -/* br.cond.sptk.many B1 */ 
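
The relocation just above scatters a 60-bit, bundle-granular IP-relative displacement across the two slots of an MLX bundle. A sketch of that bit split, mirroring paravirt_patch_reloc_brl():

	/*
	 * imm60 is the byte displacement to the target, shifted right by 4
	 * (targets are bundle aligned).  The X3 format scatters it as:
	 *   i      = imm60[59]     (1 bit,   op slot)
	 *   imm20b = imm60[19:0]   (20 bits, op slot)
	 *   imm39  = imm60[58:20]  (39 bits, imm slot)
	 */
	static void brl_split_imm60(unsigned long imm60, unsigned long *i,
				    unsigned long *imm20b, unsigned long *imm39)
	{
		*i      = (imm60 >> 59) & 1;
		*imm20b = imm60 & ((1UL << 20) - 1);
		*imm39  = (imm60 >> 20) & ((1UL << 39) - 1);
	}
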
-typedef union inst_b1 { - ia64_inst_t inst; - struct { - unsigned long qp: 6; - unsigned long btype: 3; - unsigned long unused: 3; - unsigned long p: 1; - unsigned long imm20b: 20; - unsigned long wh: 2; - unsigned long d: 1; - unsigned long s: 1; - unsigned long opcode: 4; - }; - unsigned long l; -} inst_b1_t; - -void __init -paravirt_patch_reloc_br(unsigned long tag, const void *target) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - ia64_inst_t inst = paravirt_read_inst(tag); - unsigned long target25 = (unsigned long)target - (unsigned long)bundle; - inst_b1_t inst_b1; - - BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); - - inst_b1.l = inst.l; - if (target25 & (1UL << 63)) - inst_b1.s = 1; - else - inst_b1.s = 0; - - inst_b1.imm20b = target25 >> 4; - inst.l = inst_b1.l; - - paravirt_write_inst(tag, inst); -} - -void __init -__paravirt_patch_apply_branch( - unsigned long tag, unsigned long type, - const struct paravirt_patch_branch_target *entries, - unsigned int nr_entries) -{ - unsigned int i; - for (i = 0; i < nr_entries; i++) { - if (entries[i].type == type) { - paravirt_patch_reloc_br(tag, entries[i].entry); - break; - } - } -} - -static void __init -paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start, - const struct paravirt_patch_site_branch *end) -{ - const struct paravirt_patch_site_branch *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_branch == NULL) - return; - - for (p = start; p < end; p++) - (*pv_init_ops.patch_branch)(p->tag, p->type); - - ia64_sync_i(); - ia64_srlz_i(); -} - -void __init -paravirt_patch_apply(void) -{ - extern const char __start_paravirt_bundles[]; - extern const char __stop_paravirt_bundles[]; - extern const char __start_paravirt_insts[]; - extern const char __stop_paravirt_insts[]; - extern const char __start_paravirt_branches[]; - extern const char __stop_paravirt_branches[]; - - paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *) - __start_paravirt_bundles, - (const struct paravirt_patch_site_bundle *) - __stop_paravirt_bundles); - paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *) - __start_paravirt_insts, - (const struct paravirt_patch_site_inst *) - __stop_paravirt_insts); - paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *) - __start_paravirt_branches, - (const struct paravirt_patch_site_branch *) - __stop_paravirt_branches); -} - -/* - * Local variables: - * mode: C - * c-set-style: "linux" - * c-basic-offset: 8 - * tab-width: 8 - * indent-tabs-mode: t - * End: - */ diff --git a/trunk/arch/ia64/kernel/paravirt_patchlist.c b/trunk/arch/ia64/kernel/paravirt_patchlist.c deleted file mode 100644 index b28082a95d45..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patchlist.c +++ /dev/null @@ -1,79 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include - -#define DECLARE(name) \ - extern unsigned long \ - __ia64_native_start_gate_##name##_patchlist[]; \ - extern unsigned long \ - __ia64_native_end_gate_##name##_patchlist[] - -DECLARE(fsyscall); -DECLARE(brl_fsys_bubble_down); -DECLARE(vtop); -DECLARE(mckinley_e9); - -extern unsigned long __start_gate_section[]; - -#define ASSIGN(name) \ - .start_##name##_patchlist = \ - (unsigned long)__ia64_native_start_gate_##name##_patchlist, \ - .end_##name##_patchlist = \ - (unsigned long)__ia64_native_end_gate_##name##_patchlist - -struct pv_patchdata pv_patchdata __initdata = { - ASSIGN(fsyscall), - ASSIGN(brl_fsys_bubble_down), - ASSIGN(vtop), - ASSIGN(mckinley_e9), - - .gate_section = (void*)__start_gate_section, -}; - - -unsigned long __init -paravirt_get_gate_patchlist(enum pv_gate_patchlist type) -{ - -#define CASE(NAME, name) \ - case PV_GATE_START_##NAME: \ - return pv_patchdata.start_##name##_patchlist; \ - case PV_GATE_END_##NAME: \ - return pv_patchdata.end_##name##_patchlist; \ - - switch (type) { - CASE(FSYSCALL, fsyscall); - CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down); - CASE(VTOP, vtop); - CASE(MCKINLEY_E9, mckinley_e9); - default: - BUG(); - break; - } - return 0; -} - -void * __init -paravirt_get_gate_section(void) -{ - return pv_patchdata.gate_section; -} diff --git a/trunk/arch/ia64/kernel/paravirt_patchlist.h b/trunk/arch/ia64/kernel/paravirt_patchlist.h deleted file mode 100644 index 0684aa6c6507..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patchlist.h +++ /dev/null @@ -1,28 +0,0 @@ -/****************************************************************************** - * linux/arch/ia64/xen/paravirt_patchlist.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN) -#include -#else -#include -#endif - diff --git a/trunk/arch/ia64/kernel/paravirtentry.S b/trunk/arch/ia64/kernel/paravirtentry.S index 6158560d7f17..2f42fcb9776a 100644 --- a/trunk/arch/ia64/kernel/paravirtentry.S +++ b/trunk/arch/ia64/kernel/paravirtentry.S @@ -20,11 +20,8 @@ * */ -#include #include #include -#include -#include #include "entry.h" #define DATA8(sym, init_value) \ @@ -35,87 +32,29 @@ data8 init_value ; \ .popsection -#define BRANCH(targ, reg, breg, type) \ - PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \ - ;; \ - movl reg=targ ; \ - ;; \ - ld8 reg=[reg] ; \ - ;; \ - mov breg=reg ; \ +#define BRANCH(targ, reg, breg) \ + movl reg=targ ; \ + ;; \ + ld8 reg=[reg] ; \ + ;; \ + mov breg=reg ; \ br.cond.sptk.many breg -#define BRANCH_PROC(sym, reg, breg, type) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ +#define BRANCH_PROC(sym, reg, breg) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ END(paravirt_ ## sym) -#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - PT_REGS_UNWIND_INFO(0) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ +#define BRANCH_PROC_UNWINFO(sym, reg, breg) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + PT_REGS_UNWIND_INFO(0) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ END(paravirt_ ## sym) -BRANCH_PROC(switch_to, r22, b7, SWITCH_TO) -BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL) -BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL) -BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL) - - -#ifdef CONFIG_MODULES -#define __INIT_OR_MODULE .text -#define __INITDATA_OR_MODULE .data -#else -#define __INIT_OR_MODULE __INIT -#define __INITDATA_OR_MODULE __INITDATA -#endif /* CONFIG_MODULES */ - - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_fc_i) - fc.i r32 - br.ret.sptk.many rp - END(paravirt_fc_i) - __FINIT - - __INIT_OR_MODULE - .align 32 - GLOBAL_ENTRY(paravirt_nop_b_inst_bundle) - { - nop.b 0 - nop.b 0 - nop.b 0 - } - END(paravirt_nop_b_inst_bundle) - __FINIT - - /* NOTE: nop.[mfi] has same format */ - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle) - { - nop.m 0 - nop.f 0 - nop.i 0 - } - END(paravirt_nop_mfi_inst_bundle) - __FINIT - - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_nop_bundle) -paravirt_nop_bundle_start: - { - nop 0 - nop 0 - nop 0 - } -paravirt_nop_bundle_end: - END(paravirt_nop_bundle) - __FINIT - - __INITDATA_OR_MODULE - .align 8 - .global paravirt_nop_bundle_size -paravirt_nop_bundle_size: - data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start +BRANCH_PROC(switch_to, r22, b7) +BRANCH_PROC_UNWINFO(leave_syscall, r22, b7) +BRANCH_PROC(work_processed_syscall, r2, b7) +BRANCH_PROC_UNWINFO(leave_kernel, r22, b7) diff --git a/trunk/arch/ia64/kernel/patch.c b/trunk/arch/ia64/kernel/patch.c index 68a1311db806..b83b2c516008 100644 --- a/trunk/arch/ia64/kernel/patch.c +++ b/trunk/arch/ia64/kernel/patch.c @@ -7,7 +7,6 @@ #include #include -#include 
#include #include #include @@ -170,35 +169,16 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) ia64_srlz_i(); } -extern unsigned long ia64_native_fsyscall_table[NR_syscalls]; -extern char ia64_native_fsys_bubble_down[]; -struct pv_fsys_data pv_fsys_data __initdata = { - .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table, - .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down, -}; - -unsigned long * __init -paravirt_get_fsyscall_table(void) -{ - return pv_fsys_data.fsyscall_table; -} - -char * __init -paravirt_get_fsys_bubble_down(void) -{ - return pv_fsys_data.fsys_bubble_down; -} - static void __init patch_fsyscall_table (unsigned long start, unsigned long end) { - u64 fsyscall_table = (u64)paravirt_get_fsyscall_table(); + extern unsigned long fsyscall_table[NR_syscalls]; s32 *offp = (s32 *) start; u64 ip; while (offp < (s32 *) end) { ip = (u64) ia64_imva((char *) offp + *offp); - ia64_patch_imm64(ip, fsyscall_table); + ia64_patch_imm64(ip, (u64) fsyscall_table); ia64_fc((void *) ip); ++offp; } @@ -209,7 +189,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end) static void __init patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) { - u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down(); + extern char fsys_bubble_down[]; s32 *offp = (s32 *) start; u64 ip; @@ -227,13 +207,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) void __init ia64_patch_gate (void) { -# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name) -# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name) +# define START(name) ((unsigned long) __start_gate_##name##_patchlist) +# define END(name) ((unsigned long)__end_gate_##name##_patchlist) - patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL)); - patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN)); - ia64_patch_vtop(START(VTOP), END(VTOP)); - ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9)); + patch_fsyscall_table(START(fsyscall), END(fsyscall)); + patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); + ia64_patch_vtop(START(vtop), END(vtop)); + ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); } void ia64_patch_phys_stack_reg(unsigned long val) @@ -249,7 +229,7 @@ void ia64_patch_phys_stack_reg(unsigned long val) while (offp < end) { ip = (u64) offp + *offp; ia64_patch(ip, mask, imm); - ia64_fc((void *)ip); + ia64_fc(ip); ++offp; } ia64_sync_i(); diff --git a/trunk/arch/ia64/kernel/perfmon.c b/trunk/arch/ia64/kernel/perfmon.c index 8a06dc480594..5c0f408cfd71 100644 --- a/trunk/arch/ia64/kernel/perfmon.c +++ b/trunk/arch/ia64/kernel/perfmon.c @@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg) * /proc/perfmon interface, for debug only */ -#define PFM_PROC_SHOW_HEADER ((void *)nr_cpu_ids+1) +#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) static void * pfm_proc_start(struct seq_file *m, loff_t *pos) @@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos) return PFM_PROC_SHOW_HEADER; } - while (*pos <= nr_cpu_ids) { + while (*pos <= NR_CPUS) { if (cpu_online(*pos - 1)) { return (void *)*pos; } diff --git a/trunk/arch/ia64/kernel/salinfo.c b/trunk/arch/ia64/kernel/salinfo.c index 7053c55b7649..ecb9eb78d687 100644 --- a/trunk/arch/ia64/kernel/salinfo.c +++ b/trunk/arch/ia64/kernel/salinfo.c @@ -317,7 +317,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t } n = data->cpu_check; - for (i = 0; i < nr_cpu_ids; i++) { + 
for (i = 0; i < NR_CPUS; i++) { if (cpu_isset(n, data->cpu_event)) { if (!cpu_online(n)) { cpu_clear(n, data->cpu_event); @@ -326,7 +326,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t cpu = n; break; } - if (++n == nr_cpu_ids) + if (++n == NR_CPUS) n = 0; } @@ -337,7 +337,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t /* for next read, start checking at next CPU */ data->cpu_check = cpu; - if (++data->cpu_check == nr_cpu_ids) + if (++data->cpu_check == NR_CPUS) data->cpu_check = 0; snprintf(cmd, sizeof(cmd), "read %d\n", cpu); diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c index 714066aeda7f..865af27c7737 100644 --- a/trunk/arch/ia64/kernel/setup.c +++ b/trunk/arch/ia64/kernel/setup.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -538,7 +537,6 @@ setup_arch (char **cmdline_p) paravirt_arch_setup_early(); ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); - paravirt_patch_apply(); *cmdline_p = __va(ia64_boot_param->command_line); strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); @@ -732,10 +730,10 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < nr_cpu_ids && !cpu_online(*pos)) + while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) ++*pos; #endif - return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL; + return *pos < NR_CPUS ? cpu_data(*pos) : NULL; } static void * @@ -1018,7 +1016,8 @@ cpu_init (void) | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - BUG_ON(current->mm); + if (current->mm) + BUG(); ia64_mmu_init(ia64_imva(cpu_data)); ia64_mca_cpu_init(ia64_imva(cpu_data)); diff --git a/trunk/arch/ia64/kernel/smp.c b/trunk/arch/ia64/kernel/smp.c index 2ea4199d9c57..da8f020d82c1 100644 --- a/trunk/arch/ia64/kernel/smp.c +++ b/trunk/arch/ia64/kernel/smp.c @@ -166,11 +166,11 @@ send_IPI_allbutself (int op) * Called with preemption disabled. 
*/ static inline void -send_IPI_mask(const struct cpumask *mask, int op) +send_IPI_mask(cpumask_t mask, int op) { unsigned int cpu; - for_each_cpu(cpu, mask) { + for_each_cpu_mask(cpu, mask) { send_IPI_single(cpu, op); } } @@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu) send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); } -void arch_send_call_function_ipi_mask(const struct cpumask *mask) +void arch_send_call_function_ipi(cpumask_t mask) { send_IPI_mask(mask, IPI_CALL_FUNC); } diff --git a/trunk/arch/ia64/kernel/smpboot.c b/trunk/arch/ia64/kernel/smpboot.c index 7700e23034bb..52290547c85b 100644 --- a/trunk/arch/ia64/kernel/smpboot.c +++ b/trunk/arch/ia64/kernel/smpboot.c @@ -581,14 +581,14 @@ smp_build_cpu_map (void) ia64_cpu_to_sapicid[0] = boot_cpu_id; cpus_clear(cpu_present_map); - set_cpu_present(0, true); - set_cpu_possible(0, true); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; if (sapicid == boot_cpu_id) continue; - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); ia64_cpu_to_sapicid[cpu] = sapicid; cpu++; } @@ -626,9 +626,12 @@ smp_prepare_cpus (unsigned int max_cpus) */ if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); - init_cpu_online(cpumask_of(0)); - init_cpu_present(cpumask_of(0)); - init_cpu_possible(cpumask_of(0)); + cpus_clear(cpu_online_map); + cpus_clear(cpu_present_map); + cpus_clear(cpu_possible_map); + cpu_set(0, cpu_online_map); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); return; } } diff --git a/trunk/arch/ia64/kernel/time.c b/trunk/arch/ia64/kernel/time.c index 641c8b61c4f1..d6747bae52d8 100644 --- a/trunk/arch/ia64/kernel/time.c +++ b/trunk/arch/ia64/kernel/time.c @@ -50,15 +50,6 @@ EXPORT_SYMBOL(last_cli_ip); #endif -#ifdef CONFIG_PARAVIRT -/* We need to define a real function for sched_clock, to override the - weak default version */ -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#endif - #ifdef CONFIG_PARAVIRT static void paravirt_clocksource_resume(void) diff --git a/trunk/arch/ia64/kernel/vmlinux.lds.S b/trunk/arch/ia64/kernel/vmlinux.lds.S index 4a95e86b9ac2..3765efc5f963 100644 --- a/trunk/arch/ia64/kernel/vmlinux.lds.S +++ b/trunk/arch/ia64/kernel/vmlinux.lds.S @@ -169,30 +169,6 @@ SECTIONS __end___mckinley_e9_bundles = .; } -#if defined(CONFIG_PARAVIRT) - . = ALIGN(16); - .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) - { - __start_paravirt_bundles = .; - *(.paravirt_bundles) - __stop_paravirt_bundles = .; - } - . = ALIGN(16); - .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) - { - __start_paravirt_insts = .; - *(.paravirt_insts) - __stop_paravirt_insts = .; - } - . = ALIGN(16); - .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) - { - __start_paravirt_branches = .; - *(.paravirt_branches) - __stop_paravirt_branches = .; - } -#endif - #if defined(CONFIG_IA64_GENERIC) /* Machine Vector */ . = ALIGN(16); @@ -225,12 +201,6 @@ SECTIONS __start_gate_section = .; *(.data.gate) __stop_gate_section = .; -#ifdef CONFIG_XEN - . = ALIGN(PAGE_SIZE); - __xen_start_gate_section = .; - *(.data.gate.xen) - __xen_stop_gate_section = .; -#endif } . 
= ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose * kernel data diff --git a/trunk/arch/ia64/kvm/kvm-ia64.c b/trunk/arch/ia64/kvm/kvm-ia64.c index 28af6a731bb8..076b00d1dbff 100644 --- a/trunk/arch/ia64/kvm/kvm-ia64.c +++ b/trunk/arch/ia64/kvm/kvm-ia64.c @@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len) int l; for (l = 0; l < (len + 32); l += 32) - ia64_fc((void *)(start + l)); + ia64_fc(start + l); ia64_sync_i(); ia64_srlz_i(); diff --git a/trunk/arch/ia64/kvm/vcpu.c b/trunk/arch/ia64/kvm/vcpu.c index a18ee17b9192..d4d280505878 100644 --- a/trunk/arch/ia64/kvm/vcpu.c +++ b/trunk/arch/ia64/kvm/vcpu.c @@ -386,7 +386,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1, else *rnat_addr = (*rnat_addr) & (~nat_mask); - ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore); + ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore); ia64_setreg(_IA64_REG_AR_RNAT, rnat); } local_irq_restore(psr); diff --git a/trunk/arch/ia64/kvm/vtlb.c b/trunk/arch/ia64/kvm/vtlb.c index 2c2501f13159..38232b37668b 100644 --- a/trunk/arch/ia64/kvm/vtlb.c +++ b/trunk/arch/ia64/kvm/vtlb.c @@ -210,7 +210,6 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type) phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, va, phy_pte, itir_ps(itir)); - paravirt_dv_serialize_data(); ia64_set_psr(psr); } @@ -457,7 +456,6 @@ void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, ifa, phy_pte, ps); - paravirt_dv_serialize_data(); ia64_set_psr(psr); } if (!(pte&VTLB_PTE_IO)) diff --git a/trunk/arch/ia64/mm/init.c b/trunk/arch/ia64/mm/init.c index c0f3bee69042..56e12903973c 100644 --- a/trunk/arch/ia64/mm/init.c +++ b/trunk/arch/ia64/mm/init.c @@ -35,7 +35,6 @@ #include #include #include -#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -260,7 +259,6 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) static void __init setup_gate (void) { - void *gate_section; struct page *page; /* @@ -268,11 +266,10 @@ setup_gate (void) * headers etc. and once execute-only page to enable * privilege-promotion via "epc": */ - gate_section = paravirt_get_gate_section(); - page = virt_to_page(ia64_imva(gate_section)); + page = virt_to_page(ia64_imva(__start_gate_section)); put_kernel_page(page, GATE_ADDR, PAGE_READONLY); #ifdef HAVE_BUGGY_SEGREL - page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE)); + page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE)); put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE); #else put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE); @@ -636,7 +633,8 @@ mem_init (void) #endif #ifdef CONFIG_FLATMEM - BUG_ON(!mem_map); + if (!mem_map) + BUG(); max_mapnr = max_low_pfn; #endif @@ -669,8 +667,8 @@ mem_init (void) * code can tell them apart. 
*/ for (i = 0; i < NR_syscalls; ++i) { + extern unsigned long fsyscall_table[NR_syscalls]; extern unsigned long sys_call_table[NR_syscalls]; - unsigned long *fsyscall_table = paravirt_get_fsyscall_table(); if (!fsyscall_table[i] || nolwsys) fsyscall_table[i] = sys_call_table[i] | 1; diff --git a/trunk/arch/ia64/mm/tlb.c b/trunk/arch/ia64/mm/tlb.c index b9f3d7bbb338..bd9818a36b47 100644 --- a/trunk/arch/ia64/mm/tlb.c +++ b/trunk/arch/ia64/mm/tlb.c @@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_disable(); #ifdef CONFIG_SMP - if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) { + if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) { platform_global_tlb_purge(mm, start, end, nbits); preempt_enable(); return; diff --git a/trunk/arch/ia64/scripts/pvcheck.sed b/trunk/arch/ia64/scripts/pvcheck.sed index e59809a3fc01..ba66ac2e4c60 100644 --- a/trunk/arch/ia64/scripts/pvcheck.sed +++ b/trunk/arch/ia64/scripts/pvcheck.sed @@ -17,7 +17,6 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g -s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g diff --git a/trunk/arch/ia64/sn/kernel/io_common.c b/trunk/arch/ia64/sn/kernel/io_common.c index 57f280dd9def..0d4ffa4da1da 100644 --- a/trunk/arch/ia64/sn/kernel/io_common.c +++ b/trunk/arch/ia64/sn/kernel/io_common.c @@ -135,7 +135,8 @@ static s64 sn_device_fixup_war(u64 nasid, u64 widget, int device, } war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL); - BUG_ON(!war_list); + if (!war_list) + BUG(); SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST, nasid, widget, __pa(war_list), 0, 0, 0 ,0); @@ -179,20 +180,23 @@ sn_common_hubdev_init(struct hubdev_info *hubdev) sizeof(struct sn_flush_device_kernel *); hubdev->hdi_flush_nasid_list.widget_p = kzalloc(size, GFP_KERNEL); - BUG_ON(!hubdev->hdi_flush_nasid_list.widget_p); + if (!hubdev->hdi_flush_nasid_list.widget_p) + BUG(); for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) { size = DEV_PER_WIDGET * sizeof(struct sn_flush_device_kernel); sn_flush_device_kernel = kzalloc(size, GFP_KERNEL); - BUG_ON(!sn_flush_device_kernel); + if (!sn_flush_device_kernel) + BUG(); dev_entry = sn_flush_device_kernel; for (device = 0; device < DEV_PER_WIDGET; device++, dev_entry++) { size = sizeof(struct sn_flush_device_common); dev_entry->common = kzalloc(size, GFP_KERNEL); - BUG_ON(!dev_entry->common); + if (!dev_entry->common) + BUG(); if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST)) status = sal_get_device_dmaflush_list( hubdev->hdi_nasid, widget, device, @@ -322,7 +326,8 @@ sn_common_bus_fixup(struct pci_bus *bus, */ controller->platform_data = kzalloc(sizeof(struct sn_platform_data), GFP_KERNEL); - BUG_ON(controller->platform_data == NULL); + if (controller->platform_data == NULL) + BUG(); sn_platform_data = (struct sn_platform_data *) controller->platform_data; sn_platform_data->provider_soft = provider_soft; diff --git a/trunk/arch/ia64/sn/kernel/io_init.c b/trunk/arch/ia64/sn/kernel/io_init.c index ee774c366a06..e2eb2da60f96 100644 --- a/trunk/arch/ia64/sn/kernel/io_init.c 
+++ b/trunk/arch/ia64/sn/kernel/io_init.c @@ -128,7 +128,8 @@ sn_legacy_pci_window_fixup(struct pci_controller *controller, { controller->window = kcalloc(2, sizeof(struct pci_window), GFP_KERNEL); - BUG_ON(controller->window == NULL); + if (controller->window == NULL) + BUG(); controller->window[0].offset = legacy_io; controller->window[0].resource.name = "legacy_io"; controller->window[0].resource.flags = IORESOURCE_IO; @@ -167,7 +168,8 @@ sn_pci_window_fixup(struct pci_dev *dev, unsigned int count, idx = controller->windows; new_count = controller->windows + count; new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL); - BUG_ON(new_window == NULL); + if (new_window == NULL) + BUG(); if (controller->window) { memcpy(new_window, controller->window, sizeof(struct pci_window) * controller->windows); @@ -220,7 +222,8 @@ sn_io_slot_fixup(struct pci_dev *dev) (u64) __pa(pcidev_info), (u64) __pa(sn_irq_info)); - BUG_ON(status); /* Cannot get platform pci device information */ + if (status) + BUG(); /* Cannot get platform pci device information */ /* Copy over PIO Mapped Addresses */ @@ -304,7 +307,8 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) prom_bussoft_ptr = __va(prom_bussoft_ptr); controller = kzalloc(sizeof(*controller), GFP_KERNEL); - BUG_ON(!controller); + if (!controller) + BUG(); controller->segment = segment; /* diff --git a/trunk/arch/ia64/sn/kernel/setup.c b/trunk/arch/ia64/sn/kernel/setup.c index e456f062f241..02c5b8a9fb60 100644 --- a/trunk/arch/ia64/sn/kernel/setup.c +++ b/trunk/arch/ia64/sn/kernel/setup.c @@ -732,7 +732,8 @@ void __init build_cnode_tables(void) kl_config_hdr_t *klgraph_header; nasid = cnodeid_to_nasid(node); klgraph_header = ia64_sn_get_klconfig_addr(nasid); - BUG_ON(klgraph_header == NULL); + if (klgraph_header == NULL) + BUG(); brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info); while (brd) { if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) { @@ -749,7 +750,7 @@ nasid_slice_to_cpuid(int nasid, int slice) { long cpu; - for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (cpu = 0; cpu < NR_CPUS; cpu++) if (cpuid_to_nasid(cpu) == nasid && cpuid_to_slice(cpu) == slice) return cpu; diff --git a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c index 1176506b2bae..e585f9a2afb9 100644 --- a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm) unsigned long itc; itc = ia64_get_itc(); - smp_flush_tlb_cpumask(*mm_cpumask(mm)); + smp_flush_tlb_cpumask(mm->cpu_vm_mask); itc = ia64_get_itc() - itc; __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc; __get_cpu_var(ptcstats).shub_ipi_flushes++; @@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start, nodes_clear(nodes_flushed); i = 0; - for_each_cpu(cpu, mm_cpumask(mm)) { + for_each_cpu_mask(cpu, mm->cpu_vm_mask) { cnode = cpu_to_node(cpu); node_set(cnode, nodes_flushed); lcpu = cpu; @@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu) static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) { - if (*offset < nr_cpu_ids) + if (*offset < NR_CPUS) return offset; return NULL; } @@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset) { (*offset)++; - if (*offset < nr_cpu_ids) + if (*offset < NR_CPUS) return offset; return NULL; } @@ -491,7 +491,7 @@ 
static int sn2_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt); } - if (cpu < nr_cpu_ids && cpu_online(cpu)) { + if (cpu < NR_CPUS && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l, stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed, @@ -554,7 +554,7 @@ static int __init sn2_ptc_init(void) proc_sn2_ptc = proc_create(PTC_BASENAME, 0444, NULL, &proc_sn2_ptc_operations); - if (!proc_sn2_ptc) { + if (!&proc_sn2_ptc_operations) { printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME); return -EINVAL; } diff --git a/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 9e6491cf72bd..be339477f906 100644 --- a/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -275,7 +275,8 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb /* get it's interconnect topology */ sz = op->ports * sizeof(struct sn_hwperf_port_info); - BUG_ON(sz > sizeof(ptdata)); + if (sz > sizeof(ptdata)) + BUG(); e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, SN_HWPERF_ENUM_PORTS, nodeobj->id, sz, (u64)&ptdata, 0, 0, NULL); @@ -309,7 +310,8 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb if (router && (!found_cpu || !found_mem)) { /* search for a node connected to the same router */ sz = router->ports * sizeof(struct sn_hwperf_port_info); - BUG_ON(sz > sizeof(ptdata)); + if (sz > sizeof(ptdata)) + BUG(); e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, SN_HWPERF_ENUM_PORTS, router->id, sz, (u64)&ptdata, 0, 0, NULL); @@ -610,7 +612,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK; if (cpu != SN_HWPERF_ARG_ANY_CPU) { - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + if (cpu >= NR_CPUS || !cpu_online(cpu)) { r = -EINVAL; goto out; } diff --git a/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c index c659ad5613a0..060df4aa9916 100644 --- a/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c +++ b/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c @@ -256,7 +256,9 @@ void sn_dma_flush(u64 addr) hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo; - BUG_ON(!hubinfo); + if (!hubinfo) { + BUG(); + } flush_nasid_list = &hubinfo->hdi_flush_nasid_list; if (flush_nasid_list->widget_p == NULL) diff --git a/trunk/arch/ia64/xen/Makefile b/trunk/arch/ia64/xen/Makefile index e6f4a0a74228..0ad0224693d9 100644 --- a/trunk/arch/ia64/xen/Makefile +++ b/trunk/arch/ia64/xen/Makefile @@ -3,29 +3,14 @@ # obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \ - hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \ - gate-data.o + hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o obj-$(CONFIG_IA64_GENERIC) += machvec.o -# The gate DSO image is built using a special linker script. -include $(srctree)/arch/ia64/kernel/Makefile.gate - -# tell compiled for xen -CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN -AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN - -# use same file of native. 
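One hunk above replaces the check of proc_create()'s return value with if (!&proc_sn2_ptc_operations). The address of a file-scope object can never be NULL, so that branch is dead code and a failed proc_create() goes undetected. A small model of the difference; fake_proc_create() and its types are invented stand-ins, not the kernel API:

	#include <stdio.h>
	#include <stddef.h>

	struct file_ops { int dummy; };

	static const struct file_ops proc_ops;	/* stands in for proc_sn2_ptc_operations */

	static const void *fake_proc_create(int fail)	/* stands in for proc_create() */
	{
		return fail ? NULL : (const void *)&proc_ops;
	}

	int main(void)
	{
		const void *entry = fake_proc_create(1);	/* simulate failure */

		if (!&proc_ops)		/* always false: &object is never NULL */
			puts("unreachable");
		if (!entry)		/* the check that actually detects failure */
			puts("proc entry creation failed");
		return 0;
	}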
-$(obj)/gate.o: $(src)/../kernel/gate.S FORCE - $(call if_changed_dep,as_o_S) -$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE - $(call if_changed_dep,cpp_lds_S) - - AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN # xen multi compile -ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S +ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o)) obj-y += $(ASM_PARAVIRT_OBJS) define paravirtualized_xen diff --git a/trunk/arch/ia64/xen/gate-data.S b/trunk/arch/ia64/xen/gate-data.S deleted file mode 100644 index 7d4830afc91d..000000000000 --- a/trunk/arch/ia64/xen/gate-data.S +++ /dev/null @@ -1,3 +0,0 @@ - .section .data.gate.xen, "aw" - - .incbin "arch/ia64/xen/gate.so" diff --git a/trunk/arch/ia64/xen/hypercall.S b/trunk/arch/ia64/xen/hypercall.S index e32dae444dd6..45e02bb64a92 100644 --- a/trunk/arch/ia64/xen/hypercall.S +++ b/trunk/arch/ia64/xen/hypercall.S @@ -9,7 +9,6 @@ #include #include -#ifdef __INTEL_COMPILER /* * Hypercalls without parameter. */ @@ -73,7 +72,6 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4) br.ret.sptk.many rp ;; END(xen_set_rr0_to_rr4) -#endif GLOBAL_ENTRY(xen_send_ipi) mov r14=r32 diff --git a/trunk/arch/ia64/xen/time.c b/trunk/arch/ia64/xen/time.c index fb8332690179..68d6204c3f16 100644 --- a/trunk/arch/ia64/xen/time.c +++ b/trunk/arch/ia64/xen/time.c @@ -175,58 +175,10 @@ static void xen_itc_jitter_data_reset(void) } while (unlikely(ret != lcycle)); } -/* based on xen_sched_clock() in arch/x86/xen/time.c. */ -/* - * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If it can't be defined, - * something similar logic should be implemented here. - */ -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -static unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info runstate; - - unsigned long long now; - unsigned long long offset; - unsigned long long ret; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - /* - * both ia64_native_sched_clock() and xen's runstate are - * based on mAR.ITC. So difference of them makes sense. - */ - now = ia64_native_sched_clock(); - - get_runstate_snapshot(&runstate); - - WARN_ON(runstate.state != RUNSTATE_running); - - offset = 0; - if (now > runstate.state_entry_time) - offset = now - runstate.state_entry_time; - ret = runstate.time[RUNSTATE_blocked] + - runstate.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - struct pv_time_ops xen_time_ops __initdata = { .init_missing_ticks_accounting = xen_init_missing_ticks_accounting, .do_steal_accounting = xen_do_steal_accounting, .clocksource_resume = xen_itc_jitter_data_reset, - .sched_clock = xen_sched_clock, }; /* Called after suspend, to resume time. 
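The xen_sched_clock() removed above reports "unstolen" time: nanoseconds the VCPU spent in the RUNNING and BLOCKED states, plus whatever has accumulated in the current running period since state_entry_time. A compilable model of that arithmetic, with the runstate structure simplified from the real vcpu_runstate_info:

	#include <stdio.h>

	enum { RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked, RUNSTATE_offline };

	struct vcpu_runstate_info {
		int state;
		unsigned long long state_entry_time;
		unsigned long long time[4];
	};

	static unsigned long long sched_clock_unstolen(unsigned long long now,
						       const struct vcpu_runstate_info *rs)
	{
		unsigned long long offset = 0;

		/* credit the portion of the current RUNNING period so far */
		if (now > rs->state_entry_time)
			offset = now - rs->state_entry_time;

		return rs->time[RUNSTATE_blocked] + rs->time[RUNSTATE_running] + offset;
	}

	int main(void)
	{
		struct vcpu_runstate_info rs = {
			.state = RUNSTATE_running,
			.state_entry_time = 900,
			.time = { [RUNSTATE_running] = 700, [RUNSTATE_blocked] = 200 },
		};

		/* 700 + 200 + (1000 - 900): prints 1000 */
		printf("unstolen ns: %llu\n", sched_clock_unstolen(1000, &rs));
		return 0;
	}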
*/ diff --git a/trunk/arch/ia64/xen/xen_pv_ops.c b/trunk/arch/ia64/xen/xen_pv_ops.c index 5e2270a999fa..936cff3c96e0 100644 --- a/trunk/arch/ia64/xen/xen_pv_ops.c +++ b/trunk/arch/ia64/xen/xen_pv_ops.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -154,13 +153,6 @@ xen_post_smp_prepare_boot_cpu(void) xen_setup_vcpu_info_placement(); } -#ifdef ASM_SUPPORTED -static unsigned long __init_or_module -xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type); -#endif -static void __init -xen_patch_branch(unsigned long tag, unsigned long type); - static const struct pv_init_ops xen_init_ops __initconst = { .banner = xen_banner, @@ -171,53 +163,6 @@ static const struct pv_init_ops xen_init_ops __initconst = { .arch_setup_nomca = xen_arch_setup_nomca, .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu, -#ifdef ASM_SUPPORTED - .patch_bundle = xen_patch_bundle, -#endif - .patch_branch = xen_patch_branch, -}; - -/*************************************************************************** - * pv_fsys_data - * addresses for fsys - */ - -extern unsigned long xen_fsyscall_table[NR_syscalls]; -extern char xen_fsys_bubble_down[]; -struct pv_fsys_data xen_fsys_data __initdata = { - .fsyscall_table = (unsigned long *)xen_fsyscall_table, - .fsys_bubble_down = (void *)xen_fsys_bubble_down, -}; - -/*************************************************************************** - * pv_patchdata - * patchdata addresses - */ - -#define DECLARE(name) \ - extern unsigned long __xen_start_gate_##name##_patchlist[]; \ - extern unsigned long __xen_end_gate_##name##_patchlist[] - -DECLARE(fsyscall); -DECLARE(brl_fsys_bubble_down); -DECLARE(vtop); -DECLARE(mckinley_e9); - -extern unsigned long __xen_start_gate_section[]; - -#define ASSIGN(name) \ - .start_##name##_patchlist = \ - (unsigned long)__xen_start_gate_##name##_patchlist, \ - .end_##name##_patchlist = \ - (unsigned long)__xen_end_gate_##name##_patchlist - -static struct pv_patchdata xen_patchdata __initdata = { - ASSIGN(fsyscall), - ASSIGN(brl_fsys_bubble_down), - ASSIGN(vtop), - ASSIGN(mckinley_e9), - - .gate_section = (void*)__xen_start_gate_section, }; /*************************************************************************** @@ -225,76 +170,6 @@ static struct pv_patchdata xen_patchdata __initdata = { * intrinsics hooks. */ -#ifndef ASM_SUPPORTED -static void -xen_set_itm_with_offset(unsigned long val) -{ - /* ia64_cpu_local_tick() calls this with interrupt enabled. */ - /* WARN_ON(!irqs_disabled()); */ - xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); -} - -static unsigned long -xen_get_itm_with_offset(void) -{ - /* unused at this moment */ - printk(KERN_DEBUG "%s is called.\n", __func__); - - WARN_ON(!irqs_disabled()); - return ia64_native_getreg(_IA64_REG_CR_ITM) + - XEN_MAPPEDREGS->itc_offset; -} - -/* ia64_set_itc() is only called by - * cpu_init() with ia64_set_itc(0) and ia64_sync_itc(). - * So XEN_MAPPEDRESG->itc_offset cal be considered as almost constant. 
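The xen_set_itc()/xen_get_itc() pair deleted in the hunk that follows virtualizes ar.itc with a per-domain offset and keeps reads monotonic through a cmpxchg loop on itc_last. A userspace model of both, using C11 atomics in place of the ia64 cmpxchg (a portability assumption; the removed kernel code is visible below):

	#include <stdio.h>
	#include <stdatomic.h>

	static unsigned long host_itc;		/* stands in for ar.itc */
	static unsigned long itc_offset;	/* XEN_MAPPEDREGS->itc_offset */
	static _Atomic unsigned long itc_last;	/* XEN_MAPPEDREGS->itc_last */

	static void model_set_itc(unsigned long val)
	{
		itc_offset = val - host_itc;
		atomic_store(&itc_last, val);
	}

	static unsigned long model_get_itc(void)
	{
		unsigned long res, last;

		do {
			last = atomic_load(&itc_last);
			res = host_itc + itc_offset;
			if (last >= res)	/* never step backwards */
				res = last + 1;
		} while (!atomic_compare_exchange_weak(&itc_last, &last, res));
		return res;
	}

	int main(void)
	{
		host_itc = 500;
		model_set_itc(1000);
		printf("%lu\n", model_get_itc());	/* 1001: bumped past itc_last */
		host_itc += 10;
		printf("%lu\n", model_get_itc());	/* 1010 */
		return 0;
	}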
- */ -static void -xen_set_itc(unsigned long val) -{ - unsigned long mitc; - - WARN_ON(!irqs_disabled()); - mitc = ia64_native_getreg(_IA64_REG_AR_ITC); - XEN_MAPPEDREGS->itc_offset = val - mitc; - XEN_MAPPEDREGS->itc_last = val; -} - -static unsigned long -xen_get_itc(void) -{ - unsigned long res; - unsigned long itc_offset; - unsigned long itc_last; - unsigned long ret_itc_last; - - itc_offset = XEN_MAPPEDREGS->itc_offset; - do { - itc_last = XEN_MAPPEDREGS->itc_last; - res = ia64_native_getreg(_IA64_REG_AR_ITC); - res += itc_offset; - if (itc_last >= res) - res = itc_last + 1; - ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, - itc_last, res); - } while (unlikely(ret_itc_last != itc_last)); - return res; - -#if 0 - /* ia64_itc_udelay() calls ia64_get_itc() with interrupt enabled. - Should it be paravirtualized instead? */ - WARN_ON(!irqs_disabled()); - itc_offset = XEN_MAPPEDREGS->itc_offset; - itc_last = XEN_MAPPEDREGS->itc_last; - res = ia64_native_getreg(_IA64_REG_AR_ITC); - res += itc_offset; - if (itc_last >= res) - res = itc_last + 1; - XEN_MAPPEDREGS->itc_last = res; - return res; -#endif -} - static void xen_setreg(int regnum, unsigned long val) { switch (regnum) { @@ -306,14 +181,11 @@ static void xen_setreg(int regnum, unsigned long val) xen_set_eflag(val); break; #endif - case _IA64_REG_AR_ITC: - xen_set_itc(val); - break; case _IA64_REG_CR_TPR: xen_set_tpr(val); break; case _IA64_REG_CR_ITM: - xen_set_itm_with_offset(val); + xen_set_itm(val); break; case _IA64_REG_CR_EOI: xen_eoi(val); @@ -337,12 +209,6 @@ static unsigned long xen_getreg(int regnum) res = xen_get_eflag(); break; #endif - case _IA64_REG_AR_ITC: - res = xen_get_itc(); - break; - case _IA64_REG_CR_ITM: - res = xen_get_itm_with_offset(); - break; case _IA64_REG_CR_IVR: res = xen_get_ivr(); break; @@ -393,417 +259,8 @@ xen_intrin_local_irq_restore(unsigned long mask) else xen_rsm_i(); } -#else -#define __DEFINE_FUNC(name, code) \ - extern const char xen_ ## name ## _direct_start[]; \ - extern const char xen_ ## name ## _direct_end[]; \ - asm (".align 32\n" \ - ".proc xen_" #name "\n" \ - "xen_" #name ":\n" \ - "xen_" #name "_direct_start:\n" \ - code \ - "xen_" #name "_direct_end:\n" \ - "br.cond.sptk.many b6\n" \ - ".endp xen_" #name "\n") - -#define DEFINE_VOID_FUNC0(name, code) \ - extern void \ - xen_ ## name (void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1(name, code) \ - extern void \ - xen_ ## name (unsigned long arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1_VOID(name, code) \ - extern void \ - xen_ ## name (void *arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC2(name, code) \ - extern void \ - xen_ ## name (unsigned long arg0, \ - unsigned long arg1); \ - __DEFINE_FUNC(name, code) -#define DEFINE_FUNC0(name, code) \ - extern unsigned long \ - xen_ ## name (void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC1(name, type, code) \ - extern unsigned long \ - xen_ ## name (type arg); \ - __DEFINE_FUNC(name, code) - -#define XEN_PSR_I_ADDR_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS) - -/* - * static void xen_set_itm_with_offset(unsigned long val) - * xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); - */ -/* 2 bundles */ -DEFINE_VOID_FUNC1(set_itm_with_offset, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r3 = [r2]\n" - ";;\n" - "sub r8 = r8, r3\n" - "break " __stringify(HYPERPRIVOP_SET_ITM) "\n"); - -/* - * static unsigned long xen_get_itm_with_offset(void) - * return ia64_native_getreg(_IA64_REG_CR_ITM) + 
XEN_MAPPEDREGS->itc_offset; - */ -/* 2 bundles */ -DEFINE_FUNC0(get_itm_with_offset, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r3 = [r2]\n" - "mov r8 = cr.itm\n" - ";;\n" - "add r8 = r8, r2\n"); - -/* - * static void xen_set_itc(unsigned long val) - * unsigned long mitc; - * - * WARN_ON(!irqs_disabled()); - * mitc = ia64_native_getreg(_IA64_REG_AR_ITC); - * XEN_MAPPEDREGS->itc_offset = val - mitc; - * XEN_MAPPEDREGS->itc_last = val; - */ -/* 2 bundles */ -DEFINE_VOID_FUNC1(set_itc, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_LAST_OFS) "\n" - "mov r3 = ar.itc\n" - ";;\n" - "sub r3 = r8, r3\n" - "st8 [r2] = r8, " - __stringify(XSI_ITC_LAST_OFS) " - " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "st8 [r2] = r3\n"); - -/* - * static unsigned long xen_get_itc(void) - * unsigned long res; - * unsigned long itc_offset; - * unsigned long itc_last; - * unsigned long ret_itc_last; - * - * itc_offset = XEN_MAPPEDREGS->itc_offset; - * do { - * itc_last = XEN_MAPPEDREGS->itc_last; - * res = ia64_native_getreg(_IA64_REG_AR_ITC); - * res += itc_offset; - * if (itc_last >= res) - * res = itc_last + 1; - * ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, - * itc_last, res); - * } while (unlikely(ret_itc_last != itc_last)); - * return res; - */ -/* 5 bundles */ -DEFINE_FUNC0(get_itc, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - /* r9 = itc_offset */ - /* r2 = XSI_ITC_OFFSET */ - "888:\n" - "mov r8 = ar.itc\n" /* res = ar.itc */ - ";;\n" - "ld8 r3 = [r2]\n" /* r3 = itc_last */ - "add r8 = r8, r9\n" /* res = ar.itc + itc_offset */ - ";;\n" - "cmp.gtu p6, p0 = r3, r8\n" - ";;\n" - "(p6) add r8 = 1, r3\n" /* if (itc_last > res) itc_last + 1 */ - ";;\n" - "mov ar.ccv = r8\n" - ";;\n" - "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n" - ";;\n" - "cmp.ne p6, p0 = r10, r3\n" - "(p6) hint @pause\n" - "(p6) br.cond.spnt 888b\n"); - -DEFINE_VOID_FUNC1_VOID(fc, - "break " __stringify(HYPERPRIVOP_FC) "\n"); - -/* - * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR - * masked_addr = *psr_i_addr_addr - * pending_intr_addr = masked_addr - 1 - * if (val & IA64_PSR_I) { - * masked = *masked_addr - * *masked_addr = 0:xen_set_virtual_psr_i(1) - * compiler barrier - * if (masked) { - * uint8_t pending = *pending_intr_addr; - * if (pending) - * XEN_HYPER_SSM_I - * } - * } else { - * *masked_addr = 1:xen_set_virtual_psr_i(0) - * } - */ -/* 6 bundles */ -DEFINE_VOID_FUNC1(intrin_local_irq_restore, - /* r8 = input value: 0 or IA64_PSR_I - * p6 = (flags & IA64_PSR_I) - * = if clause - * p7 = !(flags & IA64_PSR_I) - * = else clause - */ - "cmp.ne p6, p7 = r8, r0\n" - "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - /* r9 = XEN_PSR_I_ADDR */ - "ld8 r9 = [r9]\n" - ";;\n" - - /* r10 = masked previous value */ - "(p6) ld1.acq r10 = [r9]\n" - ";;\n" - - /* p8 = !masked interrupt masked previously? */ - "(p6) cmp.ne.unc p8, p0 = r10, r0\n" - - /* p7 = else clause */ - "(p7) mov r11 = 1\n" - ";;\n" - /* masked = 1 */ - "(p7) st1.rel [r9] = r11\n" - - /* p6 = if clause */ - /* masked = 0 - * r9 = masked_addr - 1 - * = pending_intr_addr - */ - "(p8) st1.rel [r9] = r0, -1\n" - ";;\n" - /* r8 = pending_intr */ - "(p8) ld1.acq r11 = [r9]\n" - ";;\n" - /* p9 = interrupt pending? 
*/ - "(p8) cmp.ne.unc p9, p10 = r11, r0\n" - ";;\n" - "(p10) mf\n" - /* issue hypercall to trigger interrupt */ - "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"); - -DEFINE_VOID_FUNC2(ptcga, - "break " __stringify(HYPERPRIVOP_PTC_GA) "\n"); -DEFINE_VOID_FUNC2(set_rr, - "break " __stringify(HYPERPRIVOP_SET_RR) "\n"); - -/* - * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR; - * tmp = *tmp - * tmp = *tmp; - * psr_i = tmp? 0: IA64_PSR_I; - */ -/* 4 bundles */ -DEFINE_FUNC0(get_psr_i, - "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "ld8 r9 = [r9]\n" /* r9 = XEN_PSR_I_ADDR */ - "mov r8 = 0\n" /* psr_i = 0 */ - ";;\n" - "ld1.acq r9 = [r9]\n" /* r9 = XEN_PSR_I */ - ";;\n" - "cmp.eq.unc p6, p0 = r9, r0\n" /* p6 = (XEN_PSR_I != 0) */ - ";;\n" - "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n"); - -DEFINE_FUNC1(thash, unsigned long, - "break " __stringify(HYPERPRIVOP_THASH) "\n"); -DEFINE_FUNC1(get_cpuid, int, - "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n"); -DEFINE_FUNC1(get_pmd, int, - "break " __stringify(HYPERPRIVOP_GET_PMD) "\n"); -DEFINE_FUNC1(get_rr, unsigned long, - "break " __stringify(HYPERPRIVOP_GET_RR) "\n"); - -/* - * void xen_privop_ssm_i(void) - * - * int masked = !xen_get_virtual_psr_i(); - * // masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr) - * xen_set_virtual_psr_i(1) - * // *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0 - * // compiler barrier - * if (masked) { - * uint8_t* pend_int_addr = - * (uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1; - * uint8_t pending = *pend_int_addr; - * if (pending) - * XEN_HYPER_SSM_I - * } - */ -/* 4 bundles */ -DEFINE_VOID_FUNC0(ssm_i, - "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I_ADDR */ - ";;\n" - "ld1.acq r9 = [r8]\n" /* r9 = XEN_PSR_I */ - ";;\n" - "st1.rel [r8] = r0, -1\n" /* psr_i = 0. enable interrupt - * r8 = XEN_PSR_I_ADDR - 1 - * = pend_int_addr - */ - "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I - * previously interrupt - * masked? 
- */ - ";;\n" - "(p6) ld1.acq r8 = [r8]\n" /* r8 = xen_pend_int */ - ";;\n" - "(p6) cmp.eq.unc p6, p7 = r8, r0\n" /*interrupt pending?*/ - ";;\n" - /* issue hypercall to get interrupt */ - "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n" - ";;\n"); - -/* - * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr - * = XEN_PSR_I_ADDR_ADDR; - * psr_i_addr = *psr_i_addr_addr; - * *psr_i_addr = 1; - */ -/* 2 bundles */ -DEFINE_VOID_FUNC0(rsm_i, - "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - /* r8 = XEN_PSR_I_ADDR */ - "mov r9 = 1\n" - ";;\n" - "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I */ - ";;\n" - "st1.rel [r8] = r9\n"); /* XEN_PSR_I = 1 */ - -extern void -xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4); -__DEFINE_FUNC(set_rr0_to_rr4, - "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n"); - - -extern unsigned long xen_getreg(int regnum); -#define __DEFINE_GET_REG(id, privop) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r8\n" \ - ";;\n" \ - "(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" - -__DEFINE_FUNC(getreg, - __DEFINE_GET_REG(PSR, PSR) -#ifdef CONFIG_IA32_SUPPORT - __DEFINE_GET_REG(AR_EFLAG, EFLAG) -#endif - - /* get_itc */ - "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_get_itc\n" - ";;\n" - - /* get itm */ - "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_get_itm_with_offset\n" - ";;\n" - - __DEFINE_GET_REG(CR_IVR, IVR) - __DEFINE_GET_REG(CR_TPR, TPR) - - /* fall back */ - "movl r2 = ia64_native_getreg_func\n" - ";;\n" - "mov b7 = r2\n" - ";;\n" - "br.cond.sptk.many b7\n"); - -extern void xen_setreg(int regnum, unsigned long val); -#define __DEFINE_SET_REG(id, privop) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r9\n" \ - ";;\n" \ - "(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" - -__DEFINE_FUNC(setreg, - /* kr0 .. 
kr 7*/ - /* - * if (_IA64_REG_AR_KR0 <= regnum && - * regnum <= _IA64_REG_AR_KR7) { - * register __index asm ("r8") = regnum - _IA64_REG_AR_KR0 - * register __val asm ("r9") = val - * "break HYPERPRIVOP_SET_KR" - * } - */ - "mov r17 = r9\n" - "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n" - ";;\n" - "cmp.ge p6, p0 = r9, r2\n" - "sub r17 = r17, r2\n" - ";;\n" - "(p6) cmp.ge.unc p7, p0 = " - __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0) - ", r17\n" - ";;\n" - "(p7) mov r9 = r8\n" - ";;\n" - "(p7) mov r8 = r17\n" - "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n" - - /* set itm */ - "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_set_itm_with_offset\n" - - /* set itc */ - "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_set_itc\n" - -#ifdef CONFIG_IA32_SUPPORT - __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG) -#endif - __DEFINE_SET_REG(CR_TPR, SET_TPR) - __DEFINE_SET_REG(CR_EOI, EOI) - - /* fall back */ - "movl r2 = ia64_native_setreg_func\n" - ";;\n" - "mov b7 = r2\n" - ";;\n" - "br.cond.sptk.many b7\n"); -#endif - -static const struct pv_cpu_ops xen_cpu_ops __initconst = { +static const struct pv_cpu_ops xen_cpu_ops __initdata = { .fc = xen_fc, .thash = xen_thash, .get_cpuid = xen_get_cpuid, @@ -880,7 +337,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val) HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); } -static struct pv_iosapic_ops xen_iosapic_ops __initdata = { +static const struct pv_iosapic_ops xen_iosapic_ops __initconst = { .pcat_compat_init = xen_pcat_compat_init, .__get_irq_chip = xen_iosapic_get_irq_chip, @@ -898,8 +355,6 @@ xen_setup_pv_ops(void) xen_info_init(); pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_fsys_data = xen_fsys_data; - pv_patchdata = xen_patchdata; pv_cpu_ops = xen_cpu_ops; pv_iosapic_ops = xen_iosapic_ops; pv_irq_ops = xen_irq_ops; @@ -907,252 +362,3 @@ xen_setup_pv_ops(void) paravirt_cpu_asm_init(&xen_cpu_asm_switch); } - -#ifdef ASM_SUPPORTED -/*************************************************************************** - * binary pacthing - * pv_init_ops.patch_bundle - */ - -#define DEFINE_FUNC_GETREG(name, privop) \ - DEFINE_FUNC0(get_ ## name, \ - "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n") - -DEFINE_FUNC_GETREG(psr, PSR); -DEFINE_FUNC_GETREG(eflag, EFLAG); -DEFINE_FUNC_GETREG(ivr, IVR); -DEFINE_FUNC_GETREG(tpr, TPR); - -#define DEFINE_FUNC_SET_KR(n) \ - DEFINE_VOID_FUNC0(set_kr ## n, \ - ";;\n" \ - "mov r9 = r8\n" \ - "mov r8 = " #n "\n" \ - "break " __stringify(HYPERPRIVOP_SET_KR) "\n") - -DEFINE_FUNC_SET_KR(0); -DEFINE_FUNC_SET_KR(1); -DEFINE_FUNC_SET_KR(2); -DEFINE_FUNC_SET_KR(3); -DEFINE_FUNC_SET_KR(4); -DEFINE_FUNC_SET_KR(5); -DEFINE_FUNC_SET_KR(6); -DEFINE_FUNC_SET_KR(7); - -#define __DEFINE_FUNC_SETREG(name, privop) \ - DEFINE_VOID_FUNC0(name, \ - "break "__stringify(HYPERPRIVOP_ ## privop) "\n") - -#define DEFINE_FUNC_SETREG(name, privop) \ - __DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop) - -DEFINE_FUNC_SETREG(eflag, EFLAG); -DEFINE_FUNC_SETREG(tpr, TPR); -__DEFINE_FUNC_SETREG(eoi, EOI); - -extern const char xen_check_events[]; -extern const char __xen_intrin_local_irq_restore_direct_start[]; -extern const char __xen_intrin_local_irq_restore_direct_end[]; -extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc; - -asm ( - ".align 32\n" - ".proc xen_check_events\n" - "xen_check_events:\n" - /* masked = 0 - * r9 = masked_addr - 1 - * 
= pending_intr_addr - */ - "st1.rel [r9] = r0, -1\n" - ";;\n" - /* r8 = pending_intr */ - "ld1.acq r11 = [r9]\n" - ";;\n" - /* p9 = interrupt pending? */ - "cmp.ne p9, p10 = r11, r0\n" - ";;\n" - "(p10) mf\n" - /* issue hypercall to trigger interrupt */ - "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n" - "br.cond.sptk.many b6\n" - ".endp xen_check_events\n" - "\n" - ".align 32\n" - ".proc __xen_intrin_local_irq_restore_direct\n" - "__xen_intrin_local_irq_restore_direct:\n" - "__xen_intrin_local_irq_restore_direct_start:\n" - "1:\n" - "{\n" - "cmp.ne p6, p7 = r8, r0\n" - "mov r17 = ip\n" /* get ip to calc return address */ - "mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "}\n" - "{\n" - /* r9 = XEN_PSR_I_ADDR */ - "ld8 r9 = [r9]\n" - ";;\n" - /* r10 = masked previous value */ - "(p6) ld1.acq r10 = [r9]\n" - "adds r17 = 1f - 1b, r17\n" /* calculate return address */ - ";;\n" - "}\n" - "{\n" - /* p8 = !masked interrupt masked previously? */ - "(p6) cmp.ne.unc p8, p0 = r10, r0\n" - "\n" - /* p7 = else clause */ - "(p7) mov r11 = 1\n" - ";;\n" - "(p8) mov b6 = r17\n" /* set return address */ - "}\n" - "{\n" - /* masked = 1 */ - "(p7) st1.rel [r9] = r11\n" - "\n" - "[99:]\n" - "(p8) brl.cond.dptk.few xen_check_events\n" - "}\n" - /* pv calling stub is 5 bundles. fill nop to adjust return address */ - "{\n" - "nop 0\n" - "nop 0\n" - "nop 0\n" - "}\n" - "1:\n" - "__xen_intrin_local_irq_restore_direct_end:\n" - ".endp __xen_intrin_local_irq_restore_direct\n" - "\n" - ".align 8\n" - "__xen_intrin_local_irq_restore_direct_reloc:\n" - "data8 99b\n" -); - -static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[] -__initdata_or_module = -{ -#define XEN_PATCH_BUNDLE_ELEM(name, type) \ - { \ - (void*)xen_ ## name ## _direct_start, \ - (void*)xen_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_ ## type, \ - } - - XEN_PATCH_BUNDLE_ELEM(fc, FC), - XEN_PATCH_BUNDLE_ELEM(thash, THASH), - XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), - XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), - XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA), - XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR), - XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR), - XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), - XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), - XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), - XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), - { - (void*)__xen_intrin_local_irq_restore_direct_start, - (void*)__xen_intrin_local_irq_restore_direct_end, - PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE, - }, - -#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ - { \ - xen_get_ ## name ## _direct_start, \ - xen_get_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ - } - - XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), - XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG), - - XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR), - XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR), - - XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC), - XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM), - - -#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - { \ - xen_ ## name ## _direct_start, \ - xen_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ - } - -#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - __XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg) - - XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5), - 
XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7), - - XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG), - XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR), - __XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI), - - XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC), - XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM), -}; - -static unsigned long __init_or_module -xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type) -{ - const unsigned long nelems = sizeof(xen_patch_bundle_elems) / - sizeof(xen_patch_bundle_elems[0]); - unsigned long used; - const struct paravirt_patch_bundle_elem *found; - - used = __paravirt_patch_apply_bundle(sbundle, ebundle, type, - xen_patch_bundle_elems, nelems, - &found); - - if (found == NULL) - /* fallback */ - return ia64_native_patch_bundle(sbundle, ebundle, type); - if (used == 0) - return used; - - /* relocation */ - switch (type) { - case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: { - unsigned long reloc = - __xen_intrin_local_irq_restore_direct_reloc; - unsigned long reloc_offset = reloc - (unsigned long) - __xen_intrin_local_irq_restore_direct_start; - unsigned long tag = (unsigned long)sbundle + reloc_offset; - paravirt_patch_reloc_brl(tag, xen_check_events); - break; - } - default: - /* nothing */ - break; - } - return used; -} -#endif /* ASM_SUPPOTED */ - -const struct paravirt_patch_branch_target xen_branch_target[] -__initconst = { -#define PARAVIRT_BR_TARGET(name, type) \ - { \ - &xen_ ## name, \ - PARAVIRT_PATCH_TYPE_BR_ ## type, \ - } - PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), - PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), - PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), - PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), -}; - -static void __init -xen_patch_branch(unsigned long tag, unsigned long type) -{ - const unsigned long nelem = - sizeof(xen_branch_target) / sizeof(xen_branch_target[0]); - __paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem); -} diff --git a/trunk/arch/x86/boot/memory.c b/trunk/arch/x86/boot/memory.c index 5054c2ddd1a0..8c3c25f35578 100644 --- a/trunk/arch/x86/boot/memory.c +++ b/trunk/arch/x86/boot/memory.c @@ -2,7 +2,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -17,38 +16,24 @@ #define SMAP 0x534d4150 /* ASCII "SMAP" */ -struct e820_ext_entry { - struct e820entry std; - u32 ext_flags; -} __attribute__((packed)); - static int detect_memory_e820(void) { int count = 0; u32 next = 0; - u32 size, id, edi; + u32 size, id; u8 err; struct e820entry *desc = boot_params.e820_map; - static struct e820_ext_entry buf; /* static so it is zeroed */ - - /* - * Set this here so that if the BIOS doesn't change this field - * but still doesn't change %ecx, we're still okay... - */ - buf.ext_flags = 1; do { - size = sizeof buf; + size = sizeof(struct e820entry); - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. */ - asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" + /* Important: %edx is clobbered by some BIOSes, + so it must be either used for the error output + or explicitly marked clobbered. 
*/ + asm("int $0x15; setc %0" : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + "=m" (*desc) + : "D" (desc), "d" (SMAP), "a" (0xe820)); /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on @@ -66,14 +51,8 @@ static int detect_memory_e820(void) break; } - /* ACPI 3.0 added the extended flags support. If bit 0 - in the extended flags is zero, we're supposed to simply - ignore the entry -- a backwards incompatible change! */ - if (size > 20 && !(buf.ext_flags & 1)) - continue; - - *desc++ = buf.std; count++; + desc++; } while (next && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; diff --git a/trunk/drivers/gpu/drm/drm_crtc_helper.c b/trunk/drivers/gpu/drm/drm_crtc_helper.c index a04639dc633d..1c3a8c557140 100644 --- a/trunk/drivers/gpu/drm/drm_crtc_helper.c +++ b/trunk/drivers/gpu/drm/drm_crtc_helper.c @@ -42,26 +42,6 @@ static struct drm_display_mode std_modes[] = { DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, }; -static void drm_mode_validate_flag(struct drm_connector *connector, - int flags) -{ - struct drm_display_mode *mode, *t; - - if (flags == (DRM_MODE_FLAG_DBLSCAN | DRM_MODE_FLAG_INTERLACE)) - return; - - list_for_each_entry_safe(mode, t, &connector->modes, head) { - if ((mode->flags & DRM_MODE_FLAG_INTERLACE) && - !(flags & DRM_MODE_FLAG_INTERLACE)) - mode->status = MODE_NO_INTERLACE; - if ((mode->flags & DRM_MODE_FLAG_DBLSCAN) && - !(flags & DRM_MODE_FLAG_DBLSCAN)) - mode->status = MODE_NO_DBLESCAN; - } - - return; -} - /** * drm_helper_probe_connector_modes - get complete set of display modes * @dev: DRM device @@ -92,7 +72,6 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector, struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; int count = 0; - int mode_flags = 0; DRM_DEBUG("%s\n", drm_get_connector_name(connector)); /* set all modes to the unverified state */ @@ -117,13 +96,6 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector, if (maxX && maxY) drm_mode_validate_size(dev, &connector->modes, maxX, maxY, 0); - - if (connector->interlace_allowed) - mode_flags |= DRM_MODE_FLAG_INTERLACE; - if (connector->doublescan_allowed) - mode_flags |= DRM_MODE_FLAG_DBLSCAN; - drm_mode_validate_flag(connector, mode_flags); - list_for_each_entry_safe(mode, t, &connector->modes, head) { if (mode->status == MODE_OK) mode->status = connector_funcs->mode_valid(connector, @@ -913,6 +885,7 @@ bool drm_helper_plugged_event(struct drm_device *dev) /** * drm_initial_config - setup a sane initial connector configuration * @dev: DRM device + * @can_grow: this configuration is growable * * LOCKING: * Called at init time, must take mode config lock. @@ -924,7 +897,7 @@ bool drm_helper_plugged_event(struct drm_device *dev) * RETURNS: * Zero if everything went ok, nonzero otherwise. 
*/ -bool drm_helper_initial_config(struct drm_device *dev) +bool drm_helper_initial_config(struct drm_device *dev, bool can_grow) { struct drm_connector *connector; int count = 0; diff --git a/trunk/drivers/gpu/drm/drm_edid.c b/trunk/drivers/gpu/drm/drm_edid.c index ca9c61656714..c67400067b85 100644 --- a/trunk/drivers/gpu/drm/drm_edid.c +++ b/trunk/drivers/gpu/drm/drm_edid.c @@ -125,8 +125,10 @@ static bool edid_is_valid(struct edid *edid) DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version); goto bad; } - if (edid->revision > 4) - DRM_DEBUG("EDID minor > 4, assuming backward compatibility\n"); + if (edid->revision > 3) { + DRM_ERROR("EDID has minor version %d, which is not between 0-3\n", edid->revision); + goto bad; + } for (i = 0; i < EDID_LENGTH; i++) csum += raw_edid[i]; @@ -160,7 +162,7 @@ static bool edid_vendor(struct edid *edid, char *vendor) edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@'; edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) | ((edid->mfg_id[1] & 0xe0) >> 5)) + '@'; - edid_vendor[2] = (edid->mfg_id[1] & 0x1f) + '@'; + edid_vendor[2] = (edid->mfg_id[2] & 0x1f) + '@'; return !strncmp(edid_vendor, vendor, 3); } diff --git a/trunk/drivers/gpu/drm/i915/i915_dma.c b/trunk/drivers/gpu/drm/i915/i915_dma.c index 9648d79fe36d..85549f615b1f 100644 --- a/trunk/drivers/gpu/drm/i915/i915_dma.c +++ b/trunk/drivers/gpu/drm/i915/i915_dma.c @@ -1049,7 +1049,7 @@ static int i915_load_modeset_init(struct drm_device *dev) intel_modeset_init(dev); - drm_helper_initial_config(dev); + drm_helper_initial_config(dev, false); return 0; diff --git a/trunk/drivers/gpu/drm/i915/i915_gem.c b/trunk/drivers/gpu/drm/i915/i915_gem.c index e0389ad1477d..0abccb761121 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem.c @@ -1990,20 +1990,23 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg) int regnum = obj_priv->fence_reg; uint32_t val; uint32_t pitch_val; + uint32_t fence_size_bits; - if ((obj_priv->gtt_offset & ~I915_FENCE_START_MASK) || + if ((obj_priv->gtt_offset & ~I830_FENCE_START_MASK) || (obj_priv->gtt_offset & (obj->size - 1))) { - WARN(1, "%s: object 0x%08x not 1M or size aligned\n", + WARN(1, "%s: object 0x%08x not 512K or size aligned\n", __func__, obj_priv->gtt_offset); return; } pitch_val = (obj_priv->stride / 128) - 1; - + WARN_ON(pitch_val & ~0x0000000f); val = obj_priv->gtt_offset; if (obj_priv->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; - val |= I830_FENCE_SIZE_BITS(obj->size); + fence_size_bits = I830_FENCE_SIZE_BITS(obj->size); + WARN_ON(fence_size_bits & ~0x00000f00); + val |= fence_size_bits; val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; @@ -2194,7 +2197,7 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment) return -EBUSY; if (alignment == 0) alignment = i915_gem_get_gtt_alignment(obj); - if (alignment & (PAGE_SIZE - 1)) { + if (alignment & (i915_gem_get_gtt_alignment(obj) - 1)) { DRM_ERROR("Invalid object alignment requested %u\n", alignment); return -EINVAL; } diff --git a/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c b/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c index 4cce1aef438e..6be3f927c86a 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -216,6 +216,22 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) else tile_width = 512; + /* check maximum stride & object size */ + if (IS_I965G(dev)) { + /* i965 stores the end address 
of the gtt mapping in the fence + * reg, so dont bother to check the size */ + if (stride / 128 > I965_FENCE_MAX_PITCH_VAL) + return false; + } else if (IS_I9XX(dev)) { + if (stride / tile_width > I830_FENCE_MAX_PITCH_VAL || + size > (I830_FENCE_MAX_SIZE_VAL << 20)) + return false; + } else { + if (stride / 128 > I830_FENCE_MAX_PITCH_VAL || + size > (I830_FENCE_MAX_SIZE_VAL << 19)) + return false; + } + /* 965+ just needs multiples of tile width */ if (IS_I965G(dev)) { if (stride & (tile_width - 1)) diff --git a/trunk/drivers/gpu/drm/i915/i915_reg.h b/trunk/drivers/gpu/drm/i915/i915_reg.h index 377cc588f5e9..83357b09e546 100644 --- a/trunk/drivers/gpu/drm/i915/i915_reg.h +++ b/trunk/drivers/gpu/drm/i915/i915_reg.h @@ -190,6 +190,8 @@ #define I830_FENCE_SIZE_BITS(size) ((ffs((size) >> 19) - 1) << 8) #define I830_FENCE_PITCH_SHIFT 4 #define I830_FENCE_REG_VALID (1<<0) +#define I830_FENCE_MAX_PITCH_VAL 0x10 +#define I830_FENCE_MAX_SIZE_VAL (1<<8) #define I915_FENCE_START_MASK 0x0ff00000 #define I915_FENCE_SIZE_BITS(size) ((ffs((size) >> 20) - 1) << 8) @@ -198,6 +200,7 @@ #define I965_FENCE_PITCH_SHIFT 2 #define I965_FENCE_TILING_Y_SHIFT 1 #define I965_FENCE_REG_VALID (1<<0) +#define I965_FENCE_MAX_PITCH_VAL 0x0400 /* * Instruction and interrupt control regs diff --git a/trunk/drivers/s390/net/qeth_core_offl.c b/trunk/drivers/s390/net/qeth_core_offl.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/trunk/drivers/s390/net/qeth_core_offl.h b/trunk/drivers/s390/net/qeth_core_offl.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/trunk/drivers/serial/serial_core.c b/trunk/drivers/serial/serial_core.c index b0bb29d804ae..bf3c0e32a334 100644 --- a/trunk/drivers/serial/serial_core.c +++ b/trunk/drivers/serial/serial_core.c @@ -1765,7 +1765,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i) static int uart_proc_show(struct seq_file *m, void *v) { - struct tty_driver *ttydrv = m->private; + struct tty_driver *ttydrv = v; struct uart_driver *drv = ttydrv->driver_state; int i; diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index 9adf5e4f7e96..d2cf5a54a4b8 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o + compression.o else # Normal Makefile diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index b30986f00b9d..72677ce2b74f 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -66,12 +66,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -92,6 +86,12 @@ struct btrfs_inode { */ u64 logged_trans; + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,25 +121,6 @@ struct btrfs_inode { /* the start of block group preferred for allocations. 
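The checks added to i915_tiling_ok() above cap stride and object size so they still fit the i8xx fence encoding: pitch is stored as stride/128 - 1, and the size field holds log2(size >> 19) in bits 8..11. A standalone model using the macros from i915_reg.h; the check shown simplifies the pre-9xx branch only:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	#define I830_FENCE_SIZE_BITS(size)	((ffs((size) >> 19) - 1) << 8)
	#define I830_FENCE_MAX_PITCH_VAL	0x10
	#define I830_FENCE_MAX_SIZE_VAL		(1 << 8)

	static int i830_tiling_ok(unsigned int stride, unsigned int size)
	{
		if (stride / 128 > I830_FENCE_MAX_PITCH_VAL)
			return 0;
		if (size > (I830_FENCE_MAX_SIZE_VAL << 19))
			return 0;
		return 1;
	}

	int main(void)
	{
		unsigned int size = 1 << 20;	/* 1M object */

		printf("size bits: 0x%x\n", I830_FENCE_SIZE_BITS(size));	/* 0x100 */
		printf("ok: %d\n", i830_tiling_ok(2048, size));			/* 1 */
		return 0;
	}

Because the size field is a log2, only power-of-two object sizes from 512K up encode at all, which is what the WARN_ON(fence_size_bits & ~0x00000f00) in i830_write_fence_reg() is guarding.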
*/ u64 block_group; - /* the fsync log has some corner cases that mean we have to check - * directories to see if any unlinks have been done before - * the directory was logged. See tree-log.c for all the - * details - */ - u64 last_unlink_trans; - - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. - */ - unsigned ordered_data_close:1; - struct inode vfs_inode; }; diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index dbb724124633..37f31b5529aa 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -254,13 +254,18 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + u64 prealloc_dest) { u64 parent_start; struct extent_buffer *cow; @@ -286,10 +291,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, root->root_key.objectid, - trans->transid, level, - search_start, empty_size); + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len, level); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } if (IS_ERR(cow)) return PTR_ERR(cow); @@ -392,7 +413,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, u64 prealloc_dest) { u64 search_start; int ret; @@ -415,6 +436,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; + WARN_ON(prealloc_dest); return 0; } @@ -425,7 +447,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, + prealloc_dest); return ret; } @@ -594,7 +617,7 @@ int btrfs_realloc_node(struct 
btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), 0); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -914,7 +937,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); spin_lock(&root->node_lock); @@ -922,7 +945,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, child->start, - child->len, mid->start, child->start, root->root_key.objectid, trans->transid, level - 1); @@ -949,10 +971,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (trans->transaction->delayed_refs.flushing && - btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -961,7 +979,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, 0); if (wret) { ret = wret; goto enospc; @@ -972,7 +990,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, 0); if (wret) { ret = wret; goto enospc; @@ -1153,7 +1171,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, 0); if (ret) wret = 1; else { @@ -1204,7 +1222,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, 0); if (ret) wret = 1; else { @@ -1474,6 +1492,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root u8 lowest_level = 0; u64 blocknr; u64 gen; + struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1482,6 +1501,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; + prealloc_block.objectid = 0; + again: if (p->skip_locking) b = btrfs_root_node(root); @@ -1508,11 +1529,44 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; } + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_release_path(root, p); + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + goto again; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. 
+ */ + if (level > 0 && !prealloc_block.objectid) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; if (wret) { free_extent_buffer(b); ret = wret; @@ -1688,8 +1742,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); + btrfs_set_path_blocking(p); + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } return ret; } @@ -1710,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, int ret; eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); btrfs_set_lock_blocking(eb); @@ -1768,7 +1826,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, } ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb); + &eb, 0); BUG_ON(ret); if (root->root_key.objectid == @@ -2081,7 +2139,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->len, lower->start, c->start, + lower->start, c->start, root->root_key.objectid, trans->transid, level - 1); BUG_ON(ret); @@ -2163,7 +2221,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else if (!trans->transaction->delayed_refs.flushing) { + } else { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2271,27 +2329,66 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int data_size, int empty, - struct extent_buffer *right, - int free_space, u32 left_nritems) +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. 
+ */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *upper = path->nodes[1]; + struct extent_buffer *right; + struct extent_buffer *upper; struct btrfs_disk_key disk_key; int slot; u32 i; + int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; + u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; int ret; + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + if (empty) nr = 0; else @@ -2300,7 +2397,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) push_space += data_size; - slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2431,83 +2527,25 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, return 1; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) -{ - struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; - int slot; - int free_space; - u32 left_nritems; - int ret; - - if (!path->nodes[1]) - return 1; - - slot = path->slots[1]; - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); -out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); - return 1; -} - /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. 
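Inside push_leaf_right() above, the while (i >= nr) loop decides how many items fit in the right neighbour by accumulating each item's data size plus its item header. A toy version of that accounting; ITEM_HEADER_SIZE approximates the 25-byte on-disk struct btrfs_item and should be treated as an assumption:

	#include <stdio.h>

	#define ITEM_HEADER_SIZE 25	/* approx. sizeof(struct btrfs_item) on disk */

	int main(void)
	{
		unsigned int sizes[] = { 40, 120, 300, 80, 60 };  /* data per item */
		unsigned int free_space = 400, push_space = 0;
		int n = 5, push_items = 0, i;

		/* walk from the edge nearest the neighbour, stop on overflow */
		for (i = n - 1; i >= 0; i--) {
			unsigned int need = sizes[i] + ITEM_HEADER_SIZE;

			if (push_space + need > free_space)
				break;
			push_space += need;
			push_items++;
		}

		printf("pushing %d items (%u bytes incl. headers)\n",
		       push_items, push_space);	/* 2 items, 190 bytes */
		return 0;
	}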
returns zero if the push worked, nonzero otherwise */ -static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, - int free_space, int right_nritems) +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; int slot; int i; + int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; + u32 right_nritems; u32 nr; int ret = 0; int wret; @@ -2515,6 +2553,41 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_item_size; slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } if (empty) nr = right_nritems; @@ -2681,154 +2754,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, return ret; } -/* - * push some data in the path leaf to the left, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) -{ - struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; - int slot; - int free_space; - u32 right_nritems; - int ret = 0; - - slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); -out: - btrfs_tree_unlock(left); - free_extent_buffer(left); - return ret; -} - -/* - * split the path's leaf in two, making sure there is at least data_size - * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. 
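/* Editorial aside, not patch content: the guards that make a left push
 * impossible, mirroring the checks now inlined in push_leaf_left above.
 * slot is our leaf's position in its parent, so slot == 0 means there is
 * no left sibling at all. Illustrative model only. */
static int can_push_left(int slot, int has_parent, unsigned int right_nritems)
{
	if (!has_parent)		/* we are the root leaf */
		return 0;
	if (slot == 0)			/* no left sibling exists */
		return 0;
	if (right_nritems == 0)		/* nothing to push */
		return 0;
	return 1;
}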
- */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) -{ - int data_copy_size; - int rt_data_off; - int i; - int ret = 0; - int wret; - struct btrfs_disk_key disk_key; - - nritems = nritems - mid; - btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); - - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); - - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); - - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); - - for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); - u32 ioff; - - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); - } - - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - - btrfs_set_header_nritems(l, mid); - ret = 0; - btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); - BUG_ON(path->slots[0] != slot); - - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); - BUG_ON(ret); - - if (mid <= slot) { - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] -= mid; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - - BUG_ON(path->slots[0] < 0); - - return ret; -} - /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. 
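/* Editorial sketch, not patch content: the offset rebasing done when a
 * leaf splits, mirroring the rt_data_off computation in the deleted
 * copy_for_split above. Item data grows down from the end of a leaf, so
 * items moved into the new right leaf keep their relative layout but
 * every stored offset is shifted so the first moved item's data ends
 * flush with the end of the new leaf. Plain C model with made-up sizes. */
#include <stdint.h>
#include <stdio.h>

#define LEAF_DATA_SIZE 4096u

int main(void)
{
	/* three items being moved to the right leaf, highest data first */
	uint32_t off[3]  = { 3700, 3500, 3400 };
	uint32_t size[3] = {  300,  200,  100 };

	/* leaf size minus the end of the first moved item's data */
	uint32_t rt_data_off = LEAF_DATA_SIZE - (off[0] + size[0]); /* 96 */

	for (int i = 0; i < 3; i++)
		printf("item %d: offset %u -> %u (size %u)\n",
		       i, off[i], off[i] + rt_data_off, size[i]);
	/* item 0 now ends at 3796 + 300 = 4096: flush with the leaf end */
	return 0;
}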
@@ -2846,14 +2771,17 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; int ret = 0; int wret; int double_split; int num_doubles = 0; + struct btrfs_disk_key disk_key; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && - !trans->transaction->delayed_refs.flushing) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2902,14 +2830,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, write_extent_buffer(right, root->fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (slot >= nritems) { - struct btrfs_disk_key disk_key; - btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2937,8 +2862,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (leaf_space_used(l, 0, mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (!extend && data_size && slot == 0) { - struct btrfs_disk_key disk_key; - btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2971,16 +2894,76 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, } } } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); BUG_ON(ret); + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; } - return ret; } @@ -3038,27 +3021,26 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return -EAGAIN; } - btrfs_set_path_blocking(path); ret = 
split_leaf(trans, root, &orig_key, path, sizeof(struct btrfs_item), 1); path->keep_locks = 0; BUG_ON(ret); - btrfs_unlock_up_safe(path, 1); - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - -split: /* * make sure any changes to the path from split_leaf leave it * in a blocking state */ btrfs_set_path_blocking(path); + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: item = btrfs_item_nr(leaf, path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); + buf = kmalloc(item_size, GFP_NOFS); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); @@ -3463,27 +3445,39 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, } /* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. */ -static noinline_for_stack int -setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) { + struct extent_buffer *leaf; struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; int i; u32 nritems; + u32 total_size = 0; + u32 total_data = 0; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; - struct extent_buffer *leaf; - int slot; + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; leaf = path->nodes[0]; - slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3495,6 +3489,9 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, BUG(); } + slot = path->slots[0]; + BUG_ON(slot < 0); + if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -3550,60 +3547,21 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, data_end -= data_size[i]; btrfs_set_item_size(leaf, item, data_size[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { - struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } - return ret; -} - -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. 
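/* Editorial model, not patch content: the sizing math used by the
 * restored btrfs_insert_empty_items above. A batch insert must reserve
 * room for every item's data plus one struct btrfs_item header per item
 * (a 17-byte disk key plus two u32 fields, 25 bytes on disk), and that
 * total is what btrfs_search_slot is asked to guarantee. */
#include <stdint.h>
#include <stdio.h>

#define ITEM_HEADER_SIZE 25u	/* sizeof(struct btrfs_item) on disk */

int main(void)
{
	uint32_t data_size[3] = { 16, 100, 8 };
	uint32_t total_data = 0, total_size;

	for (int i = 0; i < 3; i++)
		total_data += data_size[i];
	total_size = total_data + 3 * ITEM_HEADER_SIZE;
	printf("need %u bytes of leaf space\n", total_size); /* 199 */
	return 0;
}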
- */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - int ret = 0; - int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - BUG_ON(slot < 0); - - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, - total_data, total_size, nr); - out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3791,8 +3749,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && - !trans->transaction->delayed_refs.flushing) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3800,7 +3757,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; extent_buffer_get(leaf); - btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 9417713542a2..7dd1b6d0bf32 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -45,13 +45,6 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 -/* - * files bigger than this get some pre-flushing when they are added - * to the ordered operations list. That way we limit the total - * work done by the commit - */ -#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) - /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -408,16 +401,15 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; int reada; /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; int lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - unsigned int search_for_split:1; - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int leave_spinning:1; + int search_for_split; }; /* @@ -696,18 +688,15 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; u64 generation; u64 last_trans_committed; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; + u64 last_trans_new_blockgroup; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; @@ -728,21 +717,12 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. 
This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -757,28 +737,9 @@ struct btrfs_fs_info { * ordered extents */ spinlock_t ordered_extent_lock; - - /* - * all of the data=ordered extents pending writeback - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ struct list_head ordered_extents; - - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ struct list_head delalloc_inodes; - /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers @@ -820,11 +781,6 @@ struct btrfs_fs_info { atomic_t throttle_gen; u64 total_pinned; - - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1748,15 +1704,18 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -1818,7 +1777,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner_objectid); int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 num_bytes, + struct btrfs_root *root, u64 bytenr, u64 orig_parent, u64 parent, u64 root_objectid, u64 ref_generation, u64 owner_objectid); @@ -1879,7 +1838,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, u64 prealloc_dest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c deleted file mode 100644 index cbf7dc8ae3ec..000000000000 --- a/trunk/fs/btrfs/delayed-ref.c +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. 
All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include -#include "ctree.h" -#include "delayed-ref.h" -#include "transaction.h" - -/* - * delayed back reference update tracking. For subvolume trees - * we queue up extent allocations and backref maintenance for - * delayed processing. This avoids deep call chains where we - * add extents in the middle of btrfs_search_slot, and it allows - * us to buffer up frequently modified backrefs in an rb tree instead - * of hammering updates on the extent allocation tree. - * - * Right now this code is only used for reference counted trees, but - * the long term goal is to get rid of the similar code for delayed - * extent tree modifications. - */ - -/* - * entries in the rb tree are ordered by the byte number of the extent - * and by the byte number of the parent block. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 parent) -{ - if (bytenr < ref->bytenr) - return -1; - if (bytenr > ref->bytenr) - return 1; - if (parent < ref->parent) - return -1; - if (parent > ref->parent) - return 1; - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - u64 bytenr, u64 parent, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - int cmp; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, bytenr, parent); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - -/* - * find an entry based on (bytenr,parent). 
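/* Editorial aside, not patch content: the (bytenr, parent) ordering that
 * comp_entry above imposes on the rbtree, as a plain compare function.
 * Head nodes use parent == (u64)-1, the largest possible value, so a
 * head sorts after every regular ref for the same extent. Model only. */
#include <stdint.h>
#include <stdio.h>

static int comp_model(uint64_t e_bytenr, uint64_t e_parent,
		      uint64_t bytenr, uint64_t parent)
{
	if (bytenr < e_bytenr) return -1;
	if (bytenr > e_bytenr) return 1;
	if (parent < e_parent) return -1;
	if (parent > e_parent) return 1;
	return 0;
}

int main(void)
{
	/* a regular ref on the same extent sorts before the head node,
	 * which is why searches for (bytenr, (u64)-1) land on the head */
	printf("%d\n", comp_model(4096, (uint64_t)-1, 4096, 8192)); /* -1 */
	return 0;
}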
This returns the delayed - * ref if it was able to find one, or NULL if nothing was in that spot - */ -static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent, - struct btrfs_delayed_ref_node **last) -{ - struct rb_node *n = root->rb_node; - struct btrfs_delayed_ref_node *entry; - int cmp; - - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); - if (last) - *last = entry; - - cmp = comp_entry(entry, bytenr, parent); - if (cmp < 0) - n = n->rb_left; - else if (cmp > 0) - n = n->rb_right; - else - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - assert_spin_locked(&delayed_refs->lock); - if (mutex_trylock(&head->mutex)) - return 0; - - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - mutex_lock(&head->mutex); - spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - btrfs_put_delayed_ref(&head->node); - return 0; -} - -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) -{ - int count = 0; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - - delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - tree_search(&delayed_refs->root, start, (u64)-1, &ref); - if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } -again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ - start = 0; - node = rb_first(&delayed_refs->root); - goto again; - } - return 1; -} - -/* - * This checks to see if there are any delayed refs in the - * btree for a given bytenr. It returns one if it finds any - * and zero otherwise. - * - * If it only finds a head node, it returns 0. - * - * The idea is to use this when deciding if you can safely delete an - * extent from the extent allocation tree. There may be a pending - * ref in the rbtree that adds or removes references, so as long as this - * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent - * allocation tree. 
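/* Editorial sketch, not patch content: the lock-ordering dance used by
 * btrfs_delayed_ref_lock above, modeled with pthreads. You cannot sleep
 * on the head mutex while holding the tree spinlock, so on contention
 * the code pins the node, drops the spinlock, sleeps on the mutex,
 * retakes the spinlock, and revalidates that the node is still in the
 * tree, returning -EAGAIN if it raced away. Model only. */
#include <errno.h>
#include <pthread.h>

struct head { pthread_mutex_t mutex; int in_tree; int refs; };

static int lock_head(pthread_spinlock_t *tree_lock, struct head *h)
{
	/* caller holds *tree_lock */
	if (pthread_mutex_trylock(&h->mutex) == 0)
		return 0;		/* fast path: no contention */

	h->refs++;			/* pin: keep h alive across sleep */
	pthread_spin_unlock(tree_lock);
	pthread_mutex_lock(&h->mutex);	/* may sleep */
	pthread_spin_lock(tree_lock);

	if (!h->in_tree) {		/* raced: processed and removed */
		pthread_mutex_unlock(&h->mutex);
		h->refs--;
		return -EAGAIN;
	}
	h->refs--;
	return 0;
}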
- */ -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *prev_node; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) { - prev_node = rb_prev(&ref->rb_node); - if (!prev_node) - goto out; - ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr == bytenr) - ret = 1; - } -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - -/* - * helper function to lookup reference count - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. This way you - * can check to see what the reference count would be if all of the - * delayed refs are processed. - */ -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_key key; - u32 num_refs; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - delayed_refs = &trans->transaction->delayed_refs; -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - } else { - num_refs = 0; - ret = 0; - } - - spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - *refs = num_refs; - goto out; - } - - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } else { - *refs = num_refs; - } -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; -} - -/* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent - * - * This may free existing if the update cancels out whatever - * operation it was doing. - */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - struct btrfs_delayed_ref *existing_ref; - struct btrfs_delayed_ref *ref; - - existing_ref = btrfs_delayed_node_to_ref(existing); - ref = btrfs_delayed_node_to_ref(update); - - if (ref->pin) - existing_ref->pin = 1; - - if (ref->action != existing_ref->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. 
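/* Editorial model, not patch content: the in-memory accounting the two
 * functions above rely on. The effective reference count is whatever is
 * recorded on disk plus the running ref_mod sum kept in the head node
 * (btrfs_lookup_extent_ref), and an add queued against a pending drop
 * simply cancels in memory (update_existing_ref), so the extent tree is
 * never touched for short-lived changes. */
#include <stdio.h>

int main(void)
{
	unsigned int on_disk_refs = 2;	/* refs recorded in the extent tree */
	int head_ref_mod = 0;		/* running sum in the head node */

	head_ref_mod += 1;		/* queued ADD */
	head_ref_mod -= 1;		/* queued DROP cancels it in memory */
	head_ref_mod -= 1;		/* another DROP */

	printf("effective refs = %u\n", on_disk_refs + head_ref_mod); /* 1 */
	return 0;
}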
- */ - existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } - } else { - if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { - /* if we're adding refs, make sure all the - * details match up. The extent could - * have been totally freed and reallocated - * by a different owner before the delayed - * ref entries were removed. - */ - existing_ref->owner_objectid = ref->owner_objectid; - existing_ref->generation = ref->generation; - existing_ref->root = ref->root; - existing->num_bytes = update->num_bytes; - } - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; - } -} - -/* - * helper function to update the accounting in the head ref - * existing and update must have the same bytenr - */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; - - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - - if (ref->must_insert_reserved) { - /* if the extent was freed and then - * reallocated before the delayed ref - * entries were processed, we can end up - * with an existing head ref without - * the must_insert_reserved flag set. - * Set it again here - */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; - - /* - * update the num_bytes so we make sure the accounting - * is done correctly - */ - existing->num_bytes = update->num_bytes; - - } - - /* - * update the reference mod on the head to reflect this new operation - */ - existing->ref_mod += update->ref_mod; -} - -/* - * helper function to actually insert a delayed ref into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count in the head node and properly dealing - * with updating existing nodes as new modifications are queued. - */ -static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) -{ - struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_ref *full_ref; - struct btrfs_delayed_ref_head *head_ref = NULL; - struct btrfs_delayed_ref_root *delayed_refs; - int count_mod = 1; - int must_insert_reserved = 0; - - /* - * the head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (parent == (u64)-1) { - if (action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; - else if (action == BTRFS_UPDATE_DELAYED_HEAD) - count_mod = 0; - } - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update - * the reserved accounting when the extent is finally added, or - * if a later modification deletes the delayed ref without ever - * inserting the extent into the extent allocation tree. - * ref->must_insert_reserved is the flag used to record - * that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. 
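/* Editorial sketch, not patch content: the action normalization that
 * __btrfs_add_delayed_ref above performs. BTRFS_ADD_DELAYED_EXTENT is
 * only a flagged variant of ADD, and on head nodes (parent == (u64)-1)
 * the action translates into a ref_mod delta. Constants mirror the
 * values in this patch. */
#define ADD_REF		1
#define DROP_REF	2
#define ADD_EXTENT	3
#define UPDATE_HEAD	4

static void classify(int *action, int is_head,
		     int *count_mod, int *must_insert_reserved)
{
	*count_mod = 1;
	if (is_head) {
		if (*action == DROP_REF)
			*count_mod = -1;	/* a drop lowers the head's sum */
		else if (*action == UPDATE_HEAD)
			*count_mod = 0;		/* no refcount change at all */
	}
	*must_insert_reserved = (*action == ADD_EXTENT);
	if (*action == ADD_EXTENT)
		*action = ADD_REF;	/* no other special casing needed */
}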
- */ - if (action == BTRFS_ADD_DELAYED_EXTENT) { - must_insert_reserved = 1; - action = BTRFS_ADD_DELAYED_REF; - } else { - must_insert_reserved = 0; - } - - - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->parent = parent; - ref->ref_mod = count_mod; - ref->in_tree = 1; - ref->num_bytes = num_bytes; - - if (btrfs_delayed_ref_is_head(ref)) { - head_ref = btrfs_delayed_node_to_head(ref); - head_ref->must_insert_reserved = must_insert_reserved; - INIT_LIST_HEAD(&head_ref->cluster); - mutex_init(&head_ref->mutex); - } else { - full_ref = btrfs_delayed_node_to_ref(ref); - full_ref->root = ref_root; - full_ref->generation = ref_generation; - full_ref->owner_objectid = owner_objectid; - full_ref->pin = pin; - full_ref->action = action; - } - - existing = tree_insert(&delayed_refs->root, bytenr, - parent, &ref->rb_node); - - if (existing) { - if (btrfs_delayed_ref_is_head(ref)) - update_existing_head_ref(existing, ref); - else - update_existing_ref(trans, delayed_refs, existing, ref); - - /* - * we've updated the existing ref, free the newly - * allocated ref - */ - kfree(ref); - } else { - if (btrfs_delayed_ref_is_head(ref)) { - delayed_refs->num_heads++; - delayed_refs->num_heads_ready++; - } - delayed_refs->num_entries++; - trans->delayed_ref_updates++; - } - return 0; -} - -/* - * add a delayed ref to the tree. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. - */ -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) -{ - struct btrfs_delayed_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - int ret; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. - */ - if (parent == 0) - parent = bytenr; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - return -ENOMEM; - } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, action, pin); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, action, pin); - BUG_ON(ret); - spin_unlock(&delayed_refs->lock); - return 0; -} - -/* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. - */ -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; -} - -/* - * add a delayed ref to the tree. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. 
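/* Editorial aside, not patch content: the allocation discipline visible
 * in btrfs_add_delayed_ref above and btrfs_update_delayed_ref below.
 * Every node is allocated up front with GFP_NOFS, so the spinlock is
 * held only for the rbtree insertions, never across an allocation that
 * could recurse into the filesystem. Plain userspace model; in the real
 * code ownership of the nodes passes to the rbtree. */
#include <stdlib.h>
#include <pthread.h>

struct node { long bytenr; };

static int add_pair(pthread_spinlock_t *lock, long bytenr)
{
	struct node *head = malloc(sizeof(*head));	/* kmalloc(GFP_NOFS) */
	struct node *ref  = malloc(sizeof(*ref));

	if (!head || !ref) {
		free(head);
		free(ref);
		return -1;	/* -ENOMEM reported before any locking */
	}
	pthread_spin_lock(lock);
	head->bytenr = bytenr;	/* stand-in for the two tree_insert calls */
	ref->bytenr  = bytenr;
	pthread_spin_unlock(lock);
	return 0;
}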
- * - * The main point of this call is to add and remove a backreference in a single - * shot, taking the lock only once, and only searching for the head node once. - * - * It is the same as doing a ref add and delete in two separate calls. - */ -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin) -{ - struct btrfs_delayed_ref *ref; - struct btrfs_delayed_ref *old_ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - int ret; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); - if (!old_ref) { - kfree(ref); - return -ENOMEM; - } - - /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. - */ - if (parent == 0) - parent = bytenr; - if (orig_parent == 0) - orig_parent = bytenr; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - kfree(old_ref); - return -ENOMEM; - } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, - orig_parent, orig_ref_root, - orig_ref_generation, owner_objectid, - BTRFS_DROP_DELAYED_REF, pin); - BUG_ON(ret); - spin_unlock(&delayed_refs->lock); - return 0; -} diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h deleted file mode 100644 index 3bec2ff0b15c..000000000000 --- a/trunk/fs/btrfs/delayed-ref.h +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ -#ifndef __DELAYED_REF__ -#define __DELAYED_REF__ - -/* these are the possible values of struct btrfs_delayed_ref->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ - -struct btrfs_delayed_ref_node { - struct rb_node rb_node; - - /* the starting bytenr of the extent */ - u64 bytenr; - - /* the parent our backref will point to */ - u64 parent; - - /* the size of the extent */ - u64 num_bytes; - - /* ref count on this data structure */ - atomic_t refs; - - /* - * how many refs is this entry adding or deleting. For - * head refs, this may be a negative number because it is keeping - * track of the total mods done to the reference count. - * For individual refs, this will always be a positive number - * - * It may be more than one, since it is possible for a single - * parent to have more than one ref on an extent - */ - int ref_mod; - - /* is this node still in the rbtree? */ - unsigned int in_tree:1; -}; - -/* - * the head refs are used to hold a lock on a given extent, which allows us - * to make sure that only one process is running the delayed refs - * at a time for a single extent. They also store the sum of all the - * reference count modifications we've queued up. - */ -struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - - /* - * the mutex is held while running the refs, and it is also - * held when checking the sum of reference modifications. - */ - struct mutex mutex; - - struct list_head cluster; - - /* - * when a new extent is allocated, it is just reserved in memory - * The actual extent isn't inserted into the extent allocation tree - * until the delayed ref is processed. must_insert_reserved is - * used to flag a delayed ref so the accounting can be updated - * when a full insert is done. - * - * It is possible the extent will be freed before it is ever - * inserted into the extent allocation tree. In this case - * we need to update the in ram accounting to properly reflect - * the free has happened. 
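/* Editorial model, not patch content: the lifecycle the comment above
 * describes. A fresh allocation is only reserved in memory until its
 * delayed ref runs; if the extent is freed before that, the flag tells
 * the accounting to release the reservation instead of doing the
 * insert. Illustrative only. */
#include <stdio.h>

int main(void)
{
	int must_insert_reserved = 1;	/* extent reserved, not yet on disk */
	int freed_before_insert  = 1;	/* ref dropped before processing */

	if (must_insert_reserved && freed_before_insert)
		printf("release in-ram reservation; skip the extent tree\n");
	else if (must_insert_reserved)
		printf("do the full insert when the delayed ref runs\n");
	return 0;
}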
- */ - unsigned int must_insert_reserved:1; -}; - -struct btrfs_delayed_ref { - struct btrfs_delayed_ref_node node; - - /* the root objectid our ref will point to */ - u64 root; - - /* the generation for the backref */ - u64 generation; - - /* owner_objectid of the backref */ - u64 owner_objectid; - - /* operation done by this entry in the rbtree */ - u8 action; - - /* if pin == 1, when the extent is freed it will be pinned until - * transaction commit - */ - unsigned int pin:1; -}; - -struct btrfs_delayed_ref_root { - struct rb_root root; - - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; - - /* how many delayed ref updates we've queued, used by the - * throttling code - */ - unsigned long num_entries; - - /* total number of head nodes in tree */ - unsigned long num_heads; - - /* total number of head nodes ready for processing */ - unsigned long num_heads_ready; - - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; - - u64 run_delayed_start; -}; - -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); - kfree(ref); - } -} - -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin); - -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head); -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -/* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. 
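/* Editorial sketch, not patch content: the embedded-node pattern the
 * helpers below rely on. Head refs and regular refs both embed the same
 * btrfs_delayed_ref_node, heads are tagged with parent == (u64)-1, and
 * container_of() recovers the outer struct. Plain C equivalent. */
#include <stddef.h>
#include <stdint.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { uint64_t parent; };
struct head { struct node node; int flushing; };

static int is_head(struct node *n)
{
	return n->parent == (uint64_t)-1;
}

static struct head *to_head(struct node *n)
{
	/* only valid for heads, as the WARN_ON in the real helper enforces */
	return is_head(n) ? container_of(n, struct head, node) : NULL;
}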
- */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->parent == (u64)-1; -} - -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_ref * -btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref, node); - -} - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); - -} -#endif diff --git a/trunk/fs/btrfs/dir-item.c b/trunk/fs/btrfs/dir-item.c index 1d70236ba00c..926a0b287a7d 100644 --- a/trunk/fs/btrfs/dir-item.c +++ b/trunk/fs/btrfs/dir-item.c @@ -145,10 +145,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); - path = btrfs_alloc_path(); - path->leave_spinning = 1; - data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c index 92d73929d381..6ec80c0fc869 100644 --- a/trunk/fs/btrfs/disk-io.c +++ b/trunk/fs/btrfs/disk-io.c @@ -668,31 +668,14 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); - WARN_ON(!eb); - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; } - free_extent_buffer(eb); - - unlock_page(page); - return 0; + return extent_write_full_page(tree, page, btree_get_extent, wbc); } static int btree_writepages(struct address_space *mapping, @@ -701,15 +684,15 @@ static int btree_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; u64 num_dirty; + u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; - /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); if (num_dirty < thresh) return 0; } @@ -876,17 +859,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - 
} - - /* ugh, clear_extent_buffer_dirty needs to lock the page */ + /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -1496,6 +1471,12 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { @@ -1512,7 +1493,6 @@ static int transaction_kthread(void *arg) mutex_unlock(&root->fs_info->trans_mutex); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); - sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1572,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1632,6 +1611,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; INIT_LIST_HEAD(&fs_info->dead_reloc_roots); @@ -1644,9 +1627,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); - mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2375,7 +2358,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; - int was_dirty; + + btrfs_set_lock_blocking(buf); btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { @@ -2386,13 +2370,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) @@ -2432,7 +2410,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2448,16 +2425,6 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_lock(eb); btrfs_set_header_flag(eb, 
BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/trunk/fs/btrfs/disk-io.h b/trunk/fs/btrfs/disk-io.h index c958ecbc1916..95029db227be 100644 --- a/trunk/fs/btrfs/disk-io.h +++ b/trunk/fs/btrfs/disk-io.h @@ -72,7 +72,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, diff --git a/trunk/fs/btrfs/extent-tree.c b/trunk/fs/btrfs/extent-tree.c index f5e7cae63d80..fefe83ad2059 100644 --- a/trunk/fs/btrfs/extent-tree.c +++ b/trunk/fs/btrfs/extent-tree.c @@ -49,23 +49,17 @@ struct pending_extent_op { int del; }; -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod); -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve); +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); -static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int ref_to_drop); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -560,13 +554,262 @@ static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, return ret; } +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if 
(btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + 
btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when it's ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok, backrefs and items usually go right next to each other, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. + */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, let's reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we don't overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, u64 parent, u64 ref_root, u64 ref_generation, - u64 owner_objectid, - int refs_to_add) + u64 owner_objectid) { struct btrfs_key key; struct extent_buffer *leaf; @@ -586,10 +829,9 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_ref_root(leaf, ref, ref_root); btrfs_set_ref_generation(leaf, ref, ref_generation); btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, refs_to_add); + btrfs_set_ref_num_refs(leaf, ref, 1); } else if (ret == -EEXIST) { u64 existing_owner; - BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -603,7 +845,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); existing_owner = btrfs_ref_objectid(leaf, ref); if (existing_owner != owner_objectid && @@ -615,7 +857,6 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, } else { goto out; } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_release_path(root, path); @@ -624,8 +865,7 @@ static noinline int
insert_extent_backref(struct btrfs_trans_handle *trans, static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) + struct btrfs_path *path) { struct extent_buffer *leaf; struct btrfs_extent_ref *ref; @@ -635,8 +875,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs < refs_to_drop); - num_refs -= refs_to_drop; + BUG_ON(num_refs == 0); + num_refs -= 1; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { @@ -687,28 +927,332 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. 
The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. However if we did find the + * backref next to the extent item then we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backrefs are right next to + * each other and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not. + */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us whether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to each other, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) { int ret; - int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); - ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, - orig_parent, parent, orig_root, - ref_root, orig_generation, - ref_generation, owner_objectid, pin); + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent =
orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); return ret; } int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, + u64 orig_parent, u64 parent, u64 ref_root, u64 ref_generation, u64 owner_objectid) { @@ -716,35 +1260,19 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - - ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, parent, ref_root, - ref_root, ref_generation, - ref_generation, owner_objectid); + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); return ret; } + static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) -{ - int ret; - - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - return ret; -} - -static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, - int refs_to_add) { struct btrfs_path *path; int ret; @@ -758,24 +1286,17 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; - path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; + key.offset = (u64)-1; - /* first find the extent item and update its reference count */ - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - btrfs_set_path_blocking(path); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) return ret; - } + BUG_ON(ret == 0 || path->slots[0] == 0); - if (ret > 0) { - WARN_ON(1); - btrfs_free_path(path); - return -EIO; - } + path->slots[0]--; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); @@ -789,24 +1310,21 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + 
refs_to_add); - btrfs_unlock_up_safe(path, 1); - + btrfs_set_extent_refs(l, item, refs + 1); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; - path->leave_spinning = 1; - - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, ref_root, ref_generation, - owner_objectid, refs_to_add); + owner_objectid); BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + btrfs_free_path(path); return 0; } @@ -821,278 +1339,68 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} - ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, - 0, ref_root, 0, ref_generation, - owner_objectid); - return ret; -} - -static int drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node) -{ - int ret = 0; - struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); - - BUG_ON(node->ref_mod == 0); - ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, ref->pin, node->ref_mod); - - return ret; -} - -/* helper function to actually process a single delayed ref entry */ -static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - int insert_reserved) -{ - int ret; - struct btrfs_delayed_ref *ref; - - if (node->parent == (u64)-1) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - if (insert_reserved) { - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } - head = btrfs_delayed_node_to_head(node); - mutex_unlock(&head->mutex); - return 0; - } - - ref = btrfs_delayed_node_to_ref(node); - if (ref->action == BTRFS_ADD_DELAYED_REF) { - if (insert_reserved) { - struct btrfs_key ins; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - /* record the full extent allocation */ - ret = __btrfs_alloc_reserved_extent(trans, root, - node->parent, ref->root, - ref->generation, ref->owner_objectid, - &ins, node->ref_mod); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } else { - /* just add one backref */ - ret = add_extent_ref(trans, root, node->bytenr, - node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, node->ref_mod); - } - BUG_ON(ret); - } else if (ref->action == BTRFS_DROP_DELAYED_REF) { - WARN_ON(insert_reserved); - ret = drop_delayed_ref(trans, root, node); - } - return 0; -} - -static noinline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: - /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. 
- */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (btrfs_delayed_node_to_ref(ref)->action == action) - return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; -} - -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *locked_ref = NULL; - int ret; - int count = 0; - int must_insert_reserved = 0; - - delayed_refs = &trans->transaction->delayed_refs; - while (1) { - if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) - break; - - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); - - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { - locked_ref = NULL; - count++; - continue; - } - } - - /* - * record the must insert reserved flag before we - * drop the spin lock. - */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); - if (!ref) { - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - list_del_init(&locked_ref->cluster); - locked_ref = NULL; - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - - ret = run_one_delayed_ref(trans, root, ref, - must_insert_reserved); - BUG_ON(ret); - btrfs_put_delayed_ref(ref); - - count++; - cond_resched(); - spin_lock(&delayed_refs->lock); - } - return count; -} - -/* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. - */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; - int ret; - int run_all = count == (unsigned long)-1; - int run_most = 0; - - if (root == root->fs_info->extent_root) - root = root->fs_info->tree_root; - - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); -again: - spin_lock(&delayed_refs->lock); - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. 
We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + u64 start; + u64 end; + int ret; - count -= min_t(unsigned long, ret, count); + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); - if (count == 0) - break; + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; } + return 0; +} - if (run_all) { - node = rb_first(&delayed_refs->root); - if (!node) - goto out; - count = (unsigned long)-1; - - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; - - head = btrfs_delayed_node_to_head(ref); - atomic_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); - schedule_timeout(1); - goto again; + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); out: - spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); return 0; } @@ -1316,7 +1624,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, int refi = 0; int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); ref_generation = btrfs_header_generation(buf); @@ -1388,19 +1696,12 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (level == 0) { btrfs_item_key_to_cpu(buf, &key, slot); - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; ret = process_func(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) { faili = slot; @@ -1408,7 +1709,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, goto fail; 
} } else { - ret = process_func(trans, root, bytenr, buf->len, + ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, @@ -1485,17 +1786,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; ret = __btrfs_update_extent_ref(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, orig_generation, - ref_generation, key.objectid); + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, slot); ret = __btrfs_update_extent_ref(trans, root, bytenr, - buf->len, orig_buf->start, - buf->start, orig_root, ref_root, + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) @@ -1514,6 +1815,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache) { int ret; + int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; @@ -1529,8 +1831,12 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) return ret; + if (pending_ret) + return pending_ret; return 0; } @@ -2055,8 +2361,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); - while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); @@ -2148,8 +2452,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + mutex_lock(&root->fs_info->pinned_mutex); while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2157,21 +2461,209 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - cond_resched(); + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } } mutex_unlock(&root->fs_info->pinned_mutex); return ret; } +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts, restart = 0; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (restart && !num_inserts && + list_empty(&update_list)) { + restart = 0; + search = 0; + continue; + } + break; + } 
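+ + /* + * try to lock the range covered by this pending op. If somebody + * else already holds the lock we just skip it for now; on a full + * pass (all) we note the skip via restart so a later pass will + * come back to it. + */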
+ + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + if (all) + restart = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + restart = 1; + break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead and update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + + /* we may have COW'ed new blocks, so let's start over */ + if (all) + restart = 1; + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && restart) { + search = 0; + restart = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it.
Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. + */ + if (restart || all) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + restart = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, int is_data, - struct extent_buffer **must_clean) + u64 bytenr, u64 num_bytes, int is_data) { int err = 0; struct extent_buffer *buf; @@ -2194,19 +2686,17 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && header_owner != BTRFS_TREE_RELOC_OBJECTID && - header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); return 1; } btrfs_tree_unlock(buf); } free_extent_buffer(buf); pinit: - btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); BUG_ON(err < 0); @@ -2220,8 +2710,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free, - int refs_to_drop) + u64 owner_objectid, int pin, int mark_free) { struct btrfs_path *path; struct btrfs_key key; @@ -2243,7 +2732,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; - path->leave_spinning = 1; ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent, root_objectid, ref_generation, owner_objectid, 
1); @@ -2265,11 +2753,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, break; } if (!found_extent) { - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + ret = remove_extent_backref(trans, extent_root, path); BUG_ON(ret); btrfs_release_path(extent_root, path); - path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2285,9 +2771,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu gen %llu owner %llu\n", + "root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, - (unsigned long long)parent, (unsigned long long)root_objectid, (unsigned long long)ref_generation, (unsigned long long)owner_objectid); @@ -2297,23 +2782,17 @@ static int __free_extent(struct btrfs_trans_handle *trans, ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - - /* - * we're not allowed to delete the extent item if there - * are other delayed ref updates pending - */ - - BUG_ON(refs < refs_to_drop); - refs -= refs_to_drop; + BUG_ON(refs == 0); + refs -= 1; btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && - path->slots[0] == extent_slot + 1) { + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { struct btrfs_extent_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); /* if the back ref and the extent are next to each other * they get deleted below in one shot */ @@ -2321,13 +2800,11 @@ static int __free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } else if (found_extent) { /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + ret = remove_extent_backref(trans, extent_root, path); BUG_ON(ret); /* if refs are 0, we need to setup the path for deletion */ if (refs == 0) { btrfs_release_path(extent_root, path); - path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); BUG_ON(ret); @@ -2337,18 +2814,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, if (refs == 0) { u64 super_used; u64 root_used; - struct extent_buffer *must_clean = NULL; if (pin) { - ret = pin_down_bytes(trans, root, path, - bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, - &must_clean); + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); } - /* block accounting for super block */ spin_lock(&info->delalloc_lock); super_used = btrfs_super_bytes_used(&info->super_copy); @@ -2360,34 +2835,14 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used - num_bytes); spin_unlock(&info->delalloc_lock); - - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. 
del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0, @@ -2395,103 +2850,218 @@ static int __free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); return ret; } /* - * remove an extent from the root, returns 0 on success + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int refs_to_drop) +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) { - WARN_ON(num_bytes < root->sectorsize); + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; - /* - * if metadata always pin - * if data pin when any transaction has committed this - */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || - ref_generation != trans->transid) - pin = 1; +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + skipped = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } - if (ref_generation != trans->transid) - pin = 1; + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; - return __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0, refs_to_drop); + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + 
list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + if (!err) + finish_current_insert(trans, extent_root, 0); + return err; } /* - * when we free an extent, it is possible (and likely) that we free the last - * delayed ref for that extent as well. This searches the delayed ref tree for - * a given extent, and if there are no other delayed refs to be processed, it - * removes it from the tree. + * remove an extent from the root, returns 0 on success */ -static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr) +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) { - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; int ret; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) - goto out; + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; - /* - * waiting for the lock here would deadlock. If someone else has it - * locked they are already in the process of dropping it anyway - */ - if (!mutex_trylock(&head->mutex)) - goto out; + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } - /* - * at this point we have a head with no other entries. Go - * ahead and process it. 
- */ - head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } - delayed_refs->num_entries--; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + mutex_lock(&root->fs_info->pinned_mutex); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + mutex_unlock(&root->fs_info->pinned_mutex); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. - */ - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; - list_del_init(&head->cluster); - spin_unlock(&delayed_refs->lock); + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->must_insert_reserved); - BUG_ON(ret); - btrfs_put_delayed_ref(&head->node); - return 0; -out: - spin_unlock(&delayed_refs->lock); - return 0; + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; } int btrfs_free_extent(struct btrfs_trans_handle *trans, @@ -2502,30 +3072,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, { int ret; - /* - * tree log blocks never actually go into the extent allocation - * tree, just update pinning info and exit early. - * - * data extents referenced by the tree log do need to have - * their reference counts bumped. 
- */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - - /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - update_reserved_extents(root, bytenr, num_bytes, 0); - ret = 0; - } else { - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, - BTRFS_DROP_DELAYED_REF, 1); - BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); - } + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); return ret; } @@ -2926,10 +3475,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod) + u64 owner, struct btrfs_key *ins) { int ret; + int pending_ret; u64 super_used; u64 root_used; u64 num_bytes = ins->offset; @@ -2954,6 +3503,33 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used + num_bytes); spin_unlock(&info->delalloc_lock); + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + memcpy(&keys[0], ins, sizeof(*ins)); keys[1].objectid = ins->objectid; keys[1].type = BTRFS_EXTENT_REF_KEY; @@ -2964,31 +3540,37 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, extent_root, path, keys, sizes, 2); BUG_ON(ret); extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_extent_ref); btrfs_set_ref_root(path->nodes[0], ref, root_objectid); btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); btrfs_mark_buffer_dirty(path->nodes[0]); trans->alloc_exclude_start = 0; trans->alloc_exclude_nr = 0; btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } +update_block: ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0); if (ret) { @@ -3010,12 +3592,9 @@ int btrfs_alloc_reserved_extent(struct 
btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) return 0; - - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner, - BTRFS_ADD_DELAYED_EXTENT, 0); - BUG_ON(ret); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); return ret; } @@ -3042,7 +3621,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins, 1); + ref_generation, owner, ins); return ret; } @@ -3061,18 +3640,20 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); BUG_ON(ret); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_EXTENT, 0); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); } - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3208,7 +3789,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - ret = btrfs_free_extent(trans, root, disk_bytenr, + ret = __btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, key.objectid, 0); @@ -3248,7 +3829,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, */ for (i = 0; i < ref->nritems; i++) { info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, + ret = __btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, info->objectid, 0); @@ -3265,13 +3846,12 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3379,8 +3959,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, * we just decrement it below and don't update any * of the refs the leaf points to. 
*/ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); if (refs != 1) continue; @@ -3431,7 +4010,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, */ for (i = 0; i < refi; i++) { bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, eb->start, root_owner, root_gen, 0, 1); BUG_ON(ret); @@ -3474,7 +4053,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, path->nodes[*level]->len, &refs); BUG_ON(ret); if (refs > 1) @@ -3525,8 +4104,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); /* @@ -3541,7 +4119,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level - 1, 1); @@ -3587,7 +4165,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, * cleanup and free the reference on the last node * we processed */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); @@ -3776,7 +4354,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_path *path; int i; int orig_level; - int update_count; struct btrfs_root_item *root_item = &root->root_item; WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); @@ -3818,7 +4395,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } } while (1) { - unsigned long update; wret = walk_down_tree(trans, root, path, &level); if (wret > 0) break; @@ -3831,21 +4407,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; if (wret < 0) ret = wret; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (trans->transaction->in_commit) { ret = -EAGAIN; break; } atomic_inc(&root->fs_info->throttle_gen); wake_up(&root->fs_info->transaction_throttle); - for (update_count = 0; update_count < 16; update_count++) { - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, root, update); - else - break; - } } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { @@ -4890,7 +5457,6 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, key.objectid); BUG_ON(ret); - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, leaf->start, btrfs_header_owner(leaf), @@ -5202,6 +5768,9 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, ref_path, NULL, NULL); BUG_ON(ret); + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + return 0; } @@ -5469,7 +6038,6 @@ static int 
__insert_orphan_inode(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; @@ -5640,9 +6208,6 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); mutex_unlock(&root->fs_info->cleaner_mutex); - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -5901,7 +6466,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + root->fs_info->last_trans_new_blockgroup = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) @@ -5935,6 +6500,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, sizeof(cache->item)); BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); return 0; diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 08085af089e2..ebe6b29e6069 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -3124,15 +3124,20 @@ void free_extent_buffer(struct extent_buffer *eb) int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { + int set; unsigned long i; unsigned long num_pages; struct page *page; + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!PageDirty(page)) + if (!set && !PageDirty(page)) continue; lock_page(page); @@ -3141,6 +3146,22 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, else set_page_private(page, EXTENT_PAGE_PRIVATE); + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3166,13 +3187,29 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, { unsigned long i; unsigned long num_pages; - int was_dirty = 0; - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. 
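
The extent_io.c hunks here move metadata dirtiness back to EXTENT_DIRTY range bits in the io tree rather than a single per-buffer flag, which is why the partial-page checks reappear: with sub-page tree blocks, several buffers can share one page, so the page may only be cleaned once no byte range on it is still dirty. A toy model of that invariant (plain C; nothing here is the kernel's extent_io API):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096
    #define BLOCK_SIZE  1024            /* four tree blocks per page */
    #define NPAGES      4

    /* one dirty bit per block: the stand-in for EXTENT_DIRTY bits */
    static bool block_dirty[NPAGES * (PAGE_SIZE / BLOCK_SIZE)];

    /* a page may be cleaned only if no block on it is still dirty */
    static bool page_can_be_cleaned(int page)
    {
        int per_page = PAGE_SIZE / BLOCK_SIZE;
        for (int i = 0; i < per_page; i++)
            if (block_dirty[page * per_page + i])
                return false;
        return true;
    }

    int main(void)
    {
        block_dirty[0] = true;      /* blocks 0 and 1 live on page 0 */
        block_dirty[1] = true;

        block_dirty[0] = false;     /* clear one buffer's dirty range */
        printf("page 0 cleanable: %d\n", page_can_be_cleaned(0)); /* 0 */

        block_dirty[1] = false;
        printf("page 0 cleanable: %d\n", page_can_be_cleaned(0)); /* 1 */
        return 0;
    }
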
+ */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - return was_dirty; + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, @@ -3752,10 +3789,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) diff --git a/trunk/fs/btrfs/extent_io.h b/trunk/fs/btrfs/extent_io.h index 5bc20abf3f3d..1f9df88afbf6 100644 --- a/trunk/fs/btrfs/extent_io.h +++ b/trunk/fs/btrfs/extent_io.h @@ -25,7 +25,6 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 -#define EXTENT_BUFFER_DIRTY 2 /* * page->private values. Every page that is controlled by the extent @@ -255,8 +254,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); -int test_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, diff --git a/trunk/fs/btrfs/file-item.c b/trunk/fs/btrfs/file-item.c index 9b99886562d0..964652435fd1 100644 --- a/trunk/fs/btrfs/file-item.c +++ b/trunk/fs/btrfs/file-item.c @@ -52,7 +52,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -524,7 +523,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -759,10 +757,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, } else { ins_size = csum_size; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); - path->leave_spinning = 0; if (ret < 0) goto fail_unlock; if (ret != 0) { @@ -780,6 +776,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); eb_token = NULL; + cond_resched(); next_sector: if (!eb_token || @@ -820,9 +817,9 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, eb_token = NULL; } btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); if (total_bytes < sums->len) { btrfs_release_path(root, path); - cond_resched(); goto again; } out: diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 9c9fb46ccd08..dc78954861b3 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -606,7 +606,6 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, 
&ins, sizeof(*extent)); BUG_ON(ret); @@ -640,22 +639,17 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, ram_bytes); btrfs_set_file_extent_type(leaf, extent, found_type); - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_set_lock_blocking(path->nodes[0]); if (disk_bytenr != 0) { ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, + disk_bytenr, orig_parent, leaf->start, root->root_key.objectid, trans->transid, ins.objectid); BUG_ON(ret); } - path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) inode_add_bytes(inode, extent_end - end); @@ -918,7 +912,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_set_file_extent_other_encoding(leaf, fi, 0); if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + ret = btrfs_update_extent_ref(trans, root, bytenr, orig_parent, leaf->start, root->root_key.objectid, trans->transid, inode->i_ino); @@ -1161,20 +1155,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, page_cache_release(pinned[1]); *ppos = pos; - /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - if (num_written > 0 && will_write) { struct btrfs_trans_handle *trans; @@ -1187,11 +1167,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - btrfs_end_transaction(trans, root); - else - btrfs_commit_transaction(trans, root); + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); } else { btrfs_commit_transaction(trans, root); } @@ -1208,18 +1185,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, int btrfs_release_file(struct inode *inode, struct file *filp) { - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. 
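
The comment above describes the replace-via-truncate hazard that ordered_data_close existed to soften: an application rewrites a file with O_TRUNC and never calls fsync(), so a crash can leave a zero-length file. The portable way for an application to close that window itself is an explicit fsync() before close(), roughly:

    #include <fcntl.h>
    #include <unistd.h>

    /* rewrite a file in place; the fsync() before close() is what
     * keeps a crash from leaving the truncated, zero-length state
     * on disk */
    int replace_file_contents(const char *path, const char *buf, size_t len)
    {
        int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
            return -1;
        if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
            close(fd);
            return -1;
        }
        return close(fd);
    }
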
- */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } if (filp->private_data) btrfs_ioctl_trans_end(filp); return 0; @@ -1295,11 +1260,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ret > 0) { ret = btrfs_commit_transaction(trans, root); } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else - ret = btrfs_commit_transaction(trans, root); + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/trunk/fs/btrfs/inode-item.c b/trunk/fs/btrfs/inode-item.c index 6b627c611808..3d46fa1f29a4 100644 --- a/trunk/fs/btrfs/inode-item.c +++ b/trunk/fs/btrfs/inode-item.c @@ -73,8 +73,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -129,7 +127,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 06d8db5afb08..17e608c4dc70 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -134,7 +134,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -168,9 +167,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap_atomic(cpage, KM_USER0); + kaddr = kmap(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap(cpage); i++; ptr += cur_size; @@ -205,7 +204,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * does the checks required to make sure the data is small enough * to fit as an inline extent. 
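
cow_file_range_inline(), whose signature changes just below, only has to decide whether data is small enough to live inside the btree leaf itself instead of in separate data blocks. The shape of that decision is simple; the threshold below is a made-up stand-in, not the kernel's actual limit:

    #include <stdbool.h>
    #include <stdint.h>

    #define FAKE_MAX_INLINE 2048   /* illustrative threshold only */

    /* store tiny (possibly compressed) data inline in the leaf,
     * fall back to regular extents otherwise; inline data must
     * start at file offset 0 */
    static bool should_inline(uint64_t start, uint64_t end,
                              uint64_t compressed_size)
    {
        uint64_t size = compressed_size ? compressed_size
                                        : end - start + 1;
        return start == 0 && size <= FAKE_MAX_INLINE;
    }
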
*/ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, +static int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -855,6 +854,11 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -931,8 +935,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static noinline int run_delalloc_nocow(struct inode *inode, - struct page *locked_page, +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -1130,7 +1133,6 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1138,12 +1140,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS)) - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + return ret; } @@ -1453,7 +1453,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, file_pos, &hint); BUG_ON(ret); @@ -1476,10 +1475,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1492,35 +1487,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, inode->i_ino, &ins); BUG_ON(ret); - btrfs_free_path(path); + btrfs_free_path(path); return 0; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. 
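
reada_csum(), removed just below, is an instance of a common pattern: issue cheap reads before taking heavyweight state (here, a transaction) so the expensive section does less blocking I/O. From userspace the analogous trick is hinting the kernel before entering a critical region, for example with posix_fadvise(); this is an analogy, not what the kernel code does internally:

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <unistd.h>

    /* warm the page cache for [off, off+len) before we grab locks
     * or open a transaction-like critical section */
    static int prefetch_range(int fd, off_t off, off_t len)
    {
        return posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED);
    }
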
- */ -static noinline void reada_csum(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_ordered_extent *ordered_extent) -{ - struct btrfs_ordered_sum *sum; - u64 bytenr; - - sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, - list); - bytenr = sum->sums[0].bytenr; - - /* - * we don't care about the results, the point of this search is - * just to get the btree leaves into ram - */ - btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); -} - /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1529,9 +1500,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_ordered_extent *ordered_extent; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_path *path; int compressed = 0; int ret; @@ -1539,33 +1509,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; - /* - * before we join the transaction, try to do some of our IO. - * This will limit the amount of IO that we have to do with - * the transaction running. We're unlikely to need to do any - * IO if the file extents are new, the disk_i_size checks - * covers the most common case. - */ - if (start < BTRFS_I(inode)->disk_i_size) { - path = btrfs_alloc_path(); - if (path) { - ret = btrfs_lookup_file_extent(NULL, root, path, - inode->i_ino, - start, 0); - ordered_extent = btrfs_lookup_ordered_extent(inode, - start); - if (!list_empty(&ordered_extent->list)) { - btrfs_release_path(root, path); - reada_csum(root, path, ordered_extent); - } - btrfs_free_path(path); - } - } - trans = btrfs_join_transaction(root, 1); - if (!ordered_extent) - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; @@ -2155,7 +2101,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2202,7 +2147,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } - path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2246,6 +2190,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2278,9 +2224,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); - - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -2555,7 +2498,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2702,7 +2644,6 @@ noinline int 
btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, break; } if (found_extent) { - btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, leaf->start, root_owner, @@ -2907,21 +2848,11 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; - } else if (inode->i_size > 0 && - attr->ia_size == 0) { - - /* we're truncating a file that used to have good - * data down to zero. Make sure it gets into - * the ordered flush list so that any new writes - * get down to disk quickly. - */ - BTRFS_I(inode)->ordered_data_close = 1; - } + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; } err = inode_setattr(inode, attr); @@ -3053,14 +2984,13 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; + bi->log_dirty_trans = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3519,7 +3449,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; @@ -3798,8 +3727,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; - - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -4436,8 +4363,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } ClearPageChecked(page); set_page_dirty(page); - - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4463,27 +4388,6 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); - - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. 
- */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) - btrfs_add_ordered_operation(trans, root, inode); - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4560,15 +4464,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); - INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); @@ -4579,24 +4480,13 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); - } - - spin_lock(&root->list_lock); + spin_lock(&BTRFS_I(inode)->root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&root->list_lock); + spin_unlock(&BTRFS_I(inode)->root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4721,36 +4611,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; - /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction - */ - if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && - new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(old_inode->i_mapping); - trans = btrfs_start_transaction(root, 1); - /* - * make sure the inode gets flushed if it is replacing - * something. - */ - if (new_inode && new_inode->i_size && - old_inode && S_ISREG(old_inode->i_mode)) { - btrfs_add_ordered_operation(trans, root, old_inode); - } - - /* - * this is an ugly little race, but the rename is required to make - * sure that if we crash, the inode is either at the old name - * or the new one. pinning the log transaction lets us make sure - * we don't allow a log commit to come in after we unlink the - * name but before we add the new name back in. 
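
The btrfs_pin_log_trans() call removed just below is an elevated-writer-count trick: while the count is nonzero a log commit cannot complete, which is what lets rename unlink the old name and insert the new one atomically with respect to fsync. A minimal model of that counter protocol (C11 atomics; names are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int log_writers;      /* models root->log_writers */

    /* "pin": hold off log commits until the matching end call */
    static void pin_log(void)  { atomic_fetch_add(&log_writers, 1); }
    static void end_log(void)  { atomic_fetch_sub(&log_writers, 1); }

    /* a log commit may only proceed once every pin is dropped */
    static bool log_commit_may_run(void)
    {
        return atomic_load(&log_writers) == 0;
    }
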
- */ - btrfs_pin_log_trans(root); - btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4758,9 +4620,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; - if (old_dentry->d_parent != new_dentry->d_parent) - btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4792,14 +4651,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); out_fail: - - /* this btrfs_end_log_trans just allows the current - * log-sub transaction to complete - */ - btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; diff --git a/trunk/fs/btrfs/locking.c b/trunk/fs/btrfs/locking.c index a5310c0f41e2..47b0a88c12a2 100644 --- a/trunk/fs/btrfs/locking.c +++ b/trunk/fs/btrfs/locking.c @@ -71,13 +71,12 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; - for (i = 0; i < 512; i++) { + cpu_relax(); if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) return 1; if (need_resched()) break; - cpu_relax(); } return 0; } @@ -96,15 +95,13 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) { int i; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { - cpu_relax(); if (!btrfs_spin_on_block(eb)) break; @@ -151,9 +148,6 @@ int btrfs_tree_lock(struct extent_buffer *eb) DEFINE_WAIT(wait); wait.func = btrfs_wake_function; - if (!btrfs_spin_on_block(eb)) - goto sleep; - while(1) { spin_nested(eb); @@ -171,10 +165,9 @@ int btrfs_tree_lock(struct extent_buffer *eb) * spin for a bit, and if the blocking flag goes away, * loop around */ - cpu_relax(); if (btrfs_spin_on_block(eb)) continue; -sleep: + prepare_to_wait_exclusive(&eb->lock_wq, &wait, TASK_UNINTERRUPTIBLE); diff --git a/trunk/fs/btrfs/ordered-data.c b/trunk/fs/btrfs/ordered-data.c index 53c87b197d70..77c2411a5f0f 100644 --- a/trunk/fs/btrfs/ordered-data.c +++ b/trunk/fs/btrfs/ordered-data.c @@ -310,16 +310,6 @@ int btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); - - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - list_del_init(&BTRFS_I(inode)->ordered_operations); - } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); mutex_unlock(&tree->mutex); @@ -379,68 +369,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) return 0; } -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. 
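
The locking.c hunks above strip the adaptive part of btrfs_tree_lock(): spin briefly on the BLOCKING bit in the hope the holder clears it soon, and only fall back to sleeping on the wait queue when spinning looks hopeless. The general spin-then-sleep shape, modeled with pthreads (constants and names are illustrative, not the kernel's):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    static atomic_int blocking;     /* models EXTENT_BUFFER_BLOCKING */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t unblocked = PTHREAD_COND_INITIALIZER;

    static void tree_lock(void)
    {
        /* cheap path: spin for a bounded number of iterations */
        for (int i = 0; i < 512; i++) {
            if (!atomic_load(&blocking))
                break;
            sched_yield();          /* stand-in for cpu_relax() */
        }

        /* slow path: sleep until the holder clears the flag */
        pthread_mutex_lock(&lock);
        while (atomic_load(&blocking))
            pthread_cond_wait(&unblocked, &lock);
        /* ...critical section would run here... */
        pthread_mutex_unlock(&lock);
    }

    /* the holder clears the flag and wakes all sleepers */
    static void clear_blocking(void)
    {
        pthread_mutex_lock(&lock);
        atomic_store(&blocking, 0);
        pthread_cond_broadcast(&unblocked);
        pthread_mutex_unlock(&lock);
    }
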
When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - iput(inode); - } - - cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); - } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - - spin_unlock(&root->fs_info->ordered_extent_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; -} - /* * Used to start IO or wait for a given ordered extent to finish. * @@ -798,49 +726,3 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, return ret; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) -{ - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod < root->fs_info->last_trans_committed) - return 0; - - /* - * the transaction is already committing. 
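
btrfs_run_ordered_operations(), deleted here, uses two patterns worth noting: splice the shared list onto a private one so the lock can be dropped while each entry is processed, and take a reference (igrab()) before dropping the lock so the inode cannot be freed underneath us. A skeletal version of the splice-and-process loop (a pthread mutex standing in for the spinlock; all types illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *shared;   /* models the ordered_operations list */

    static void flush_one(struct node *n, int wait)
    {
        printf("%s inode %d\n", wait ? "waiting on" : "flushing", n->id);
    }

    static void run_ordered_operations(int wait)
    {
        pthread_mutex_lock(&list_lock);
        struct node *splice = shared;       /* steal the whole list */
        shared = NULL;
        pthread_mutex_unlock(&list_lock);   /* work without the lock */

        while (splice) {
            struct node *n = splice;
            splice = n->next;
            flush_one(n, wait);             /* filemap_flush vs. wait */
        }
    }
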
Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; - } - - spin_lock(&root->fs_info->ordered_extent_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; -} diff --git a/trunk/fs/btrfs/ordered-data.h b/trunk/fs/btrfs/ordered-data.h index 3d31c8827b01..ab66d5e8d6d6 100644 --- a/trunk/fs/btrfs/ordered-data.h +++ b/trunk/fs/btrfs/ordered-data.h @@ -155,8 +155,4 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); #endif diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 664782c6a2df..4112d53d4f4d 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -65,15 +65,6 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->use_count = 1; cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root.rb_node = NULL; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -191,8 +182,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_group = 0; h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; - h->delayed_ref_updates = 0; - root->fs_info->running_transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); return h; @@ -282,6 +271,7 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); + throttle_on_drops(root); } @@ -290,27 +280,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; - int count = 0; - - while (count < 4) { - unsigned long cur = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; - } mutex_lock(&info->trans_mutex); cur_trans = info->running_transaction; @@ -455,10 +424,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; struct btrfs_root *tree_root = root->fs_info->tree_root; + btrfs_extent_post_op(trans, root); btrfs_write_dirty_block_groups(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, root); while (1) { old_root_bytenr = 
btrfs_root_bytenr(&root->root_item); @@ -470,14 +438,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_header_level(root->node)); btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_extent_post_op(trans, root); + ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); BUG_ON(ret); btrfs_write_dirty_block_groups(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, root); } return 0; } @@ -491,18 +459,15 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; struct extent_buffer *eb; - int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, fs_info->tree_root); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); btrfs_tree_unlock(eb); free_extent_buffer(eb); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, fs_info->tree_root); while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -510,9 +475,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } return 0; } @@ -672,31 +634,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } -/* - * when dropping snapshots, we generate a ton of delayed refs, and it makes - * sense not to join the transaction while it is trying to flush the current - * queue of delayed refs out. 
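
The loop removed from __btrfs_end_transaction() a few hunks back bounds how much deferred work any one caller is charged with: run delayed updates only if enough have accumulated (more than 64 heads ready), and make at most four passes so no caller is trapped doing cleanup forever. That throttling shape in isolation (the thresholds are copied from the removed code; everything else is illustrative):

    /* charge the caller with a bounded amount of deferred work */
    static void throttle_deferred_work(unsigned long *pending_updates,
                                       unsigned long heads_ready,
                                       void (*run)(unsigned long count))
    {
        for (int pass = 0; pass < 4; pass++) {
            unsigned long cur = *pending_updates;
            *pending_updates = 0;
            if (!cur || heads_ready <= 64)
                break;      /* too little queued to bother */
            run(cur);       /* models btrfs_run_delayed_refs() */
        }
    }
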
- * - * This is used by the drop snapshot code only - */ -static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) -{ - DEFINE_WAIT(wait); - - mutex_lock(&info->trans_mutex); - while (info->running_transaction && - info->running_transaction->delayed_refs.flushing) { - prepare_to_wait(&info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&info->trans_mutex); - schedule(); - mutex_lock(&info->trans_mutex); - finish_wait(&info->transaction_wait, &wait); - } - mutex_unlock(&info->trans_mutex); - return 0; -} - /* * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them @@ -724,22 +661,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, atomic_inc(&root->fs_info->throttles); while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); trans = btrfs_start_transaction(tree_root, 1); - - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; - } - mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, dirty->root); if (ret != -EAGAIN) @@ -844,7 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); @@ -972,31 +894,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); - - btrfs_run_ordered_operations(root, 0); - - /* make a pass through all the delayed refs we have so far - * any running procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. 
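
wait_transaction_pre_flush(), shown being removed above, is the classic sleep-until-condition loop: re-check the predicate under the lock, register as a waiter, drop the lock, sleep, re-take the lock, repeat. The same protocol with a pthread condition variable (a model of the shape, not of the kernel wait-queue API):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t transaction_wait = PTHREAD_COND_INITIALIZER;
    static bool flushing;       /* models delayed_refs.flushing */

    static void wait_transaction_pre_flush(void)
    {
        pthread_mutex_lock(&trans_mutex);
        while (flushing)        /* predicate re-checked under lock */
            pthread_cond_wait(&transaction_wait, &trans_mutex);
        pthread_mutex_unlock(&trans_mutex);
    }

    /* the flusher clears the flag and wakes all sleepers */
    static void flush_done(void)
    {
        pthread_mutex_lock(&trans_mutex);
        flushing = false;
        pthread_cond_broadcast(&transaction_wait);
        pthread_mutex_unlock(&trans_mutex);
    }
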
- */ - cur_trans->delayed_refs.flushing = 1; - - ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); INIT_LIST_HEAD(&dirty_fs_roots); - if (cur_trans->in_commit) { - cur_trans->use_count++; + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1019,6 +922,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + cur_trans = trans->transaction; if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -1033,9 +937,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } } - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) - should_grow = 1; - do { int snap_pending = 0; joined = cur_trans->num_joined; @@ -1048,7 +949,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; - else if (should_grow) + else timeout = 1; mutex_unlock(&root->fs_info->trans_mutex); @@ -1058,30 +959,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); } - /* - * rename doesn't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and know for sure that nothing new will be added - * to the list - */ - btrfs_run_ordered_operations(root, 1); - - smp_mb(); - if (cur_trans->num_writers > 1 || should_grow) - schedule_timeout(timeout); + schedule_timeout(timeout); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); } while (cur_trans->num_writers > 1 || - (should_grow && cur_trans->num_joined != joined)); + (cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - WARN_ON(cur_trans != trans->transaction); /* btrfs_commit_tree_roots is responsible for getting the @@ -1145,7 +1032,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_copy_pinned(root, pinned_copy); trans->transaction->blocked = 0; - wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); @@ -1172,7 +1058,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); cur_trans->commit_done = 1; - root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); diff --git a/trunk/fs/btrfs/transaction.h b/trunk/fs/btrfs/transaction.h index 94f5bde2b58d..ea292117f882 100644 --- a/trunk/fs/btrfs/transaction.h +++ b/trunk/fs/btrfs/transaction.h @@ -19,16 +19,10 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" -#include "delayed-ref.h" struct btrfs_transaction { u64 transid; - /* - * total writers in this transaction, it must be zero before the - * transaction can end - */ unsigned long num_writers; - unsigned long num_joined; int in_commit; int use_count; @@ -40,7 +34,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct btrfs_delayed_ref_root delayed_refs; }; struct btrfs_trans_handle { @@ -51,7 +44,6 @@ struct btrfs_trans_handle { 
u64 block_group; u64 alloc_exclude_start; u64 alloc_exclude_nr; - unsigned long delayed_ref_updates; }; struct btrfs_pending_snapshot { diff --git a/trunk/fs/btrfs/tree-defrag.c b/trunk/fs/btrfs/tree-defrag.c index b10eacdb1620..98d25fa4570e 100644 --- a/trunk/fs/btrfs/tree-defrag.c +++ b/trunk/fs/btrfs/tree-defrag.c @@ -124,6 +124,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); out: if (path) btrfs_free_path(path); diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index fc9b87a7975b..9c462fbd60fa 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -34,49 +34,6 @@ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 -/* - * directory trouble cases - * - * 1) on rename or unlink, if the inode being unlinked isn't in the fsync - * log, we must force a full commit before doing an fsync of the directory - * where the unlink was done. - * ---> record transid of last unlink/rename per directory - * - * mkdir foo/some_dir - * normal commit - * rename foo/some_dir foo2/some_dir - * mkdir foo/some_dir - * fsync foo/some_dir/some_file - * - * The fsync above will unlink the original some_dir without recording - * it in its new location (foo2). After a crash, some_dir will be gone - * unless the fsync of some_file forces a full commit - * - * 2) we must log any new names for any file or dir that is in the fsync - * log. ---> check inode while renaming/linking. - * - * 2a) we must log any new names for any file or dir during rename - * when the directory they are being removed from was logged. - * ---> check inode and old parent dir during rename - * - * 2a is actually the more important variant. With the extra logging - * a crash might unlink the old name without recreating the new one - * - * 3) after a crash, we must go through any directories with a link count - * of zero and redo the rm -rf - * - * mkdir f1/foo - * normal commit - * rm -rf f1/foo - * fsync(f1) - * - * The directory f1 was fully removed from the FS, but fsync was never - * called on f1, only its parent dir. After a crash the rm -rf must - * be replayed. This must be able to recurse down the entire - * directory tree. The inode link count fixup code takes care of the - * ugly details. - */ - /* * stages for the tree walking. 
The first * stage (0) is to only pin down the blocks we find @@ -90,17 +47,12 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int btrfs_log_inode(struct btrfs_trans_handle *trans, +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -180,26 +132,11 @@ static int join_running_log_trans(struct btrfs_root *root) return ret; } -/* - * This either makes the current running log transaction wait - * until you call btrfs_end_log_trans() or it makes any future - * log transactions wait until you call btrfs_end_log_trans() - */ -int btrfs_pin_log_trans(struct btrfs_root *root) -{ - int ret = -ENOENT; - - mutex_lock(&root->log_mutex); - atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return ret; -} - /* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -int btrfs_end_log_trans(struct btrfs_root *root) +static int end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); @@ -266,6 +203,7 @@ static int process_one_buffer(struct btrfs_root *log, mutex_lock(&log->fs_info->pinned_mutex); btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); } if (btrfs_buffer_uptodate(eb, gen)) { @@ -665,7 +603,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); - ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -867,7 +804,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); - ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -986,20 +922,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_release_path(root, path); + btrfs_free_path(path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; - if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); - BUG_ON(ret); - } - btrfs_free_path(path); - - return 0; } @@ -1042,12 +971,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - /* - * fixup on a directory may create new entries, - * make sure we always look for the highest possible - * offset - */ - key.offset = (u64)-1; + if (key.offset == 0) + break; + key.offset--; } btrfs_release_path(root, path); return 0; } @@ -1387,11 +1313,11 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 
+ } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1452,7 +1378,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all) + u64 dirid) { u64 range_start; u64 range_end; @@ -1482,14 +1408,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, range_start = 0; range_end = 0; while (1) { - if (del_all) - range_end = (u64)-1; - else { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; - } + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; dir_key.offset = range_start; while (1) { @@ -1515,8 +1437,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, - &found_key); + log_path, dir, &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1593,7 +1514,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); + root, log, path, key.objectid); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1612,17 +1533,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); - - /* if the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done - */ - if (inode->i_nlink == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(wc->trans, - root, inode); - } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, @@ -1930,8 +1840,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static int wait_log_commit(struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1945,12 +1854,9 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); - finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && @@ -1958,16 +1864,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, return 0; } -static int wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); @@ -1978,14 +1882,7 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, /* * btrfs_sync_log sends a given tree log down to the disk and * updates 
the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. - * - * Any other return value means you need to call btrfs_commit_transaction. - * Some of the edge cases for fsyncing directories that have had unlinks - * or renames done in the past mean that sometimes the only safe - * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, - * that has happened. + * you know that any inodes previously logged are safely on disk */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1999,7 +1896,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(root, root->log_transid); mutex_unlock(&root->log_mutex); return 0; } @@ -2007,26 +1904,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(root, root->log_transid - 1); while (1) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); - - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == root->log_batch) break; } - /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { - ret = -EAGAIN; - mutex_unlock(&root->log_mutex); - goto out; - } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); @@ -2062,29 +1951,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + wait_log_commit(log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); - if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); - } - - wait_for_writer(trans, log_root_tree); + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) + wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); - /* - * now that we've moved on to the tree of log tree roots, - * check the full commit flag again - */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { - mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; - goto out_wake_log_root; - } + wait_for_writer(log_root_tree); ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); @@ -2109,9 +1985,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ write_ctree_super(trans, root->fs_info->tree_root, 2); - ret = 0; -out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) @@ -2124,8 +1998,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return 0; } -/* - * free all the extents used by the tree log. This should be called +/* * free all the extents used by the tree log. 
This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2259,7 +2132,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - btrfs_end_log_trans(root); + end_log_trans(root); return 0; } @@ -2286,7 +2159,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - btrfs_end_log_trans(root); + end_log_trans(root); return ret; } @@ -2686,7 +2559,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int btrfs_log_inode(struct btrfs_trans_handle *trans, +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2712,17 +2585,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; - - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2809,6 +2693,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2817,69 +2702,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_free_path(path); btrfs_free_path(dst_path); +out: return 0; } -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct inode *inode, - struct dentry *parent, - struct super_block *sb, - u64 last_committed) +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) { - int ret = 0; - struct btrfs_root *root; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto out; - - if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - goto out; - inode = parent->d_inode; - } - - while (1) { - BTRFS_I(inode)->logged_trans = trans->transid; - smp_mb(); - - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - root->fs_info->last_trans_log_full_commit = - trans->transid; - ret = 1; - break; - } - - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; - - if (parent == sb->s_root) - break; - - parent = parent->d_parent; - inode = parent->d_inode; + int ret; - } -out: + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); return ret; } @@ -2889,65 +2724,31 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) { - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + int inode_only = LOG_INODE_ALL; struct super_block *sb; - int ret = 0; - u64 last_committed = root->fs_info->last_trans_committed; - - sb = inode->i_sb; - - if (root->fs_info->last_trans_log_full_commit > - root->fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - - ret = check_parent_dirs_for_sync(trans, inode, parent, - sb, last_committed); - if (ret) - goto end_no_trans; + int ret; start_log_trans(trans, root); - - ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
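An editorial aside, not part of the patch: the btrfs_log_inode() added above is a thin bracket around __btrfs_log_inode(), joining the log transaction before the real work and leaving it afterwards. The acquire/work/release shape, modelled standalone (every name below is a hypothetical stand-in):

#include <stdio.h>

static int log_writers;			/* models root->log_writers */

static void start_log_trans(void) { log_writers++; }
static void end_log_trans(void)   { log_writers--; }

static int do_log_inode(void)		/* models __btrfs_log_inode() */
{
	printf("logging with %d writer(s)\n", log_writers);
	return 0;
}

static int log_inode(void)		/* models the new btrfs_log_inode() */
{
	int ret;

	start_log_trans();
	ret = do_log_inode();
	end_log_trans();		/* always paired with the start */
	return ret;
}

int main(void) { return log_inode(); }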
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; - - inode_only = LOG_INODE_EXISTS; + sb = dentry->d_inode->i_sb; while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; - inode = parent->d_inode; - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); - } - if (parent == sb->s_root) + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) break; - parent = parent->d_parent; + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; } -no_parent: - ret = 0; - btrfs_end_log_trans(root); -end_no_trans: - return ret; + end_log_trans(root); + return 0; } /* @@ -2959,8 +2760,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); } /* @@ -3079,94 +2884,3 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) kfree(log_root_tree); return 0; } - -/* - * there are some corner cases where we want to force a full - * commit instead of allowing a directory to be logged. - * - * They revolve around files there were unlinked from the directory, and - * this function updates the parent directory so that a full commit is - * properly done if it is fsync'd later after the unlinks are done. - */ -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename) -{ - /* - * when we're logging a file, if it hasn't been renamed - * or unlinked, and its inode is fully committed on disk, - * we don't have to worry about walking up the directory chain - * to log its parents. - * - * So, we use the last_unlink_trans field to put this transid - * into the file. When the file is logged we check it and - * don't log the parents if the file is fully on disk. - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this directory was already logged any new - * names for this file/dir will get recorded - */ - smp_mb(); - if (BTRFS_I(dir)->logged_trans == trans->transid) - return; - - /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names - */ - if (BTRFS_I(inode)->logged_trans == trans->transid) - return; - - /* - * when renaming files across directories, if the directory - * there we're unlinking from gets fsync'd later on, there's - * no way to find the destination directory later and fsync it - * properly. So, we have to be conservative and force commits - * so the new name gets discovered. - */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: - BTRFS_I(dir)->last_unlink_trans = trans->transid; -} - -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. 
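An editorial aside, not part of the patch: btrfs_log_dentry() above logs the inode itself with LOG_INODE_ALL, then climbs d_parent, logging each ancestor with LOG_INODE_EXISTS, and stops at the first directory whose generation is at or below the last committed transaction, since that ancestor is already safe on disk. A simplified standalone model of the termination rule (hypothetical types and names):

#include <stdio.h>

struct dent {
	struct dent *parent;
	unsigned long generation;
};

static void log_dentry(struct dent *d, unsigned long last_committed)
{
	while (d) {
		printf("log inode with generation %lu\n", d->generation);
		d = d->parent;
		/* ancestors already committed need no log entry */
		if (!d || d->generation <= last_committed)
			break;
	}
}

int main(void)
{
	struct dent root = { NULL, 5 };
	struct dent dir  = { &root, 12 };
	struct dent file = { &dir, 14 };

	log_dentry(&file, 10);	/* logs file and dir, stops before root */
	return 0;
}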
- * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. - */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent) -{ - struct btrfs_root * root = BTRFS_I(inode)->root; - - /* - * this will force the logging code to walk the dentry chain - * up for the file - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this inode hasn't been logged and directory we're renaming it - * from hasn't been logged, we don't need to log it - */ - if (BTRFS_I(inode)->logged_trans <= - root->fs_info->last_trans_committed && - (!old_dir || BTRFS_I(old_dir)->logged_trans <= - root->fs_info->last_trans_committed)) - return 0; - - return btrfs_log_inode_parent(trans, root, inode, parent, 1); -} - diff --git a/trunk/fs/btrfs/tree-log.h b/trunk/fs/btrfs/tree-log.h index d09c7609e16b..b9409b32ed02 100644 --- a/trunk/fs/btrfs/tree-log.h +++ b/trunk/fs/btrfs/tree-log.h @@ -22,9 +22,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -33,16 +38,4 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); -int btrfs_join_running_log_trans(struct btrfs_root *root); -int btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename); -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent); #endif diff --git a/trunk/fs/ext4/balloc.c b/trunk/fs/ext4/balloc.c index 53c72ad85877..38f40d55899c 100644 --- a/trunk/fs/ext4/balloc.c +++ b/trunk/fs/ext4/balloc.c @@ -55,8 +55,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, } static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) + ext4_group_t block_group) { ext4_fsblk_t tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -64,6 +63,10 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, int used_blocks = sbi->s_itb_per_group + 2; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + struct ext4_group_desc *gdp; + struct buffer_head *bh; + + gdp = ext4_get_group_desc(sb, block_group, &bh); if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) used_blocks--; @@ -174,7 +177,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - 
ext4_group_used_meta_blocks(sb, block_group, gdp); + return free_blocks - ext4_group_used_meta_blocks(sb, block_group); } @@ -470,8 +473,9 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } /* * request to reload the buddy with the diff --git a/trunk/fs/ext4/dir.c b/trunk/fs/ext4/dir.c index b64789929a65..2df2e40b01af 100644 --- a/trunk/fs/ext4/dir.c +++ b/trunk/fs/ext4/dir.c @@ -67,8 +67,7 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, unsigned int offset) { const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); + const int rlen = ext4_rec_len_from_disk(de->rec_len); if (rlen < EXT4_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -179,11 +178,10 @@ static int ext4_readdir(struct file *filp, * least that it is non-zero. A * failure will be detected in the * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len) + < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + i += ext4_rec_len_from_disk(de->rec_len); } offset = i; filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) @@ -205,8 +203,7 @@ static int ext4_readdir(struct file *filp, ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -228,8 +225,7 @@ static int ext4_readdir(struct file *filp, goto revalidate; stored++; } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + filp->f_pos += ext4_rec_len_from_disk(de->rec_len); } offset = 0; brelse(bh); diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index d0f15ef56de1..990c94000924 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -32,6 +32,14 @@ */ #undef EXT4FS_DEBUG +/* + * Define EXT4_RESERVATION to reserve data blocks for expanding files + */ +#define EXT4_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT4_MAX_RESERVE_BLOCKS 1027 +#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 + /* * Debug code */ @@ -46,6 +54,8 @@ #define ext4_debug(f, a...) do {} while (0) #endif +#define EXT4_MULTIBLOCK_ALLOCATOR 1 + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -170,9 +180,8 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_blocks; - atomic_t used_dirs; + __u32 free_inodes; + __u32 free_blocks; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -240,30 +249,6 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ -/* Flags that should be inherited by new inodes from their parent. 
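An editorial aside, not part of the patch: the dir.c hunks above switch back to the one-argument ext4_rec_len_from_disk()/ext4_rec_len_to_disk() pair (restored in full in the ext4.h hunk below), which squeezes a 65536-byte record into a 16-bit field by reserving the on-disk value EXT4_MAX_REC_LEN. A standalone round trip of that convention; this userspace model uses plain integers and ignores __le16 byte order:

#include <assert.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* stands in for EXT4_MAX_REC_LEN */

static unsigned rec_len_from_disk(unsigned short dlen)
{
	if (dlen == MAX_REC_LEN || dlen == 0)
		return 1 << 16;		/* a record covering a 64KB block */
	return dlen;
}

static unsigned short rec_len_to_disk(unsigned len)
{
	assert(len <= (1 << 16));	/* the kernel version BUG()s past this */
	if (len == (1 << 16))
		return MAX_REC_LEN;
	return (unsigned short)len;
}

int main(void)
{
	assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == (1 << 16));
	assert(rec_len_from_disk(rec_len_to_disk(4096)) == 4096);
	return 0;
}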
*/ -#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ - EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & EXT4_REG_FLMASK; - else - return flags & EXT4_OTHER_FLMASK; -} - /* * Inode dynamic state flags */ @@ -271,7 +256,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ -#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -319,9 +303,7 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) #define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) /* * ioctl commands in 32 bit emulation @@ -549,7 +531,7 @@ do { \ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ @@ -684,8 +666,7 @@ struct ext4_super_block { __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_reserved_char_pad2; __le16 s_reserved_pad; - __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __u32 s_reserved[160]; /* Padding to the end of the block */ + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -832,12 +813,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ -/* - * Minimum number of groups in a flexgroup before we separate out - * directories into the first block group of a flexgroup - */ -#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 - /* * Structure of a directory entry */ @@ -890,6 +865,24 @@ struct ext4_dir_entry_2 { ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) +static inline unsigned ext4_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN || len == 0) + return 1 << 16; + return len; +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} + /* * Hash Tree Directory indexing * 
(c) Daniel Phillips, 2001 @@ -977,6 +970,22 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; +#ifdef CONFIG_PROC_FS +extern const struct file_operations ext4_ui_proc_fops; + +#define EXT4_PROC_HANDLER(name, var) \ +do { \ + proc = proc_create_data(name, mode, sbi->s_proc, \ + &ext4_ui_proc_fops, &sbi->s_##var); \ + if (proc == NULL) { \ + printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ + goto err_out; \ + } \ +} while (0) +#else +#define EXT4_PROC_HANDLER(name, var) +#endif + /* * Function prototypes */ @@ -1083,7 +1092,6 @@ extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); -extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); @@ -1099,10 +1107,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); - /* namei.c */ -extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); -extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/trunk/fs/ext4/ext4_extents.h b/trunk/fs/ext4/ext4_extents.h index f0c3ec85bd48..18cb67b2cbbc 100644 --- a/trunk/fs/ext4/ext4_extents.h +++ b/trunk/fs/ext4/ext4_extents.h @@ -241,6 +241,5 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/trunk/fs/ext4/ext4_i.h b/trunk/fs/ext4/ext4_i.h index 4ce2187123aa..e69acc16f5c4 100644 --- a/trunk/fs/ext4/ext4_i.h +++ b/trunk/fs/ext4/ext4_i.h @@ -33,6 +33,9 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + /* * storage for cached extent */ @@ -122,9 +125,6 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; - /* ialloc */ - ext4_group_t i_last_alloc_group; - /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/trunk/fs/ext4/ext4_sb.h b/trunk/fs/ext4/ext4_sb.h index 57b71fefbccf..039b6ea1a042 100644 --- a/trunk/fs/ext4/ext4_sb.h +++ b/trunk/fs/ext4/ext4_sb.h @@ -62,10 +62,12 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; + struct blockgroup_lock s_blockgroup_lock; struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; /* Journaling */ struct inode *s_journal_inode; @@ -144,10 +146,6 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group 
*s_locality_groups; - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; }; @@ -155,7 +153,7 @@ struct ext4_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); } #endif /* _EXT4_SB */ diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c index ac77d8b8251d..e0aa4fe4f596 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -152,8 +152,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -172,31 +170,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - /* - * If we are doing delayed allocation, we don't need take - * colour into account. 
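An editorial aside, not part of the patch: with the delayed-allocation shortcut deleted above, ext4_ext_find_goal() always starts from the inode's own block group and adds a per-process "colour" so concurrent writers fan out inside the group: colour = (pid % 16) * (blocks_per_group / 16). The arithmetic, modelled standalone with hypothetical names:

#include <stdio.h>

static unsigned long long find_goal(unsigned long long bg_start,
				    unsigned blocks_per_group, int pid)
{
	unsigned colour = (pid % 16) * (blocks_per_group / 16);

	return bg_start + colour;	/* begin the search here, not at 0 */
}

int main(void)
{
	/* neighbouring pids in a 32768-block group land 2048 blocks apart */
	printf("%llu\n", find_goal(1024, 32768, 100));	/* 1024 + 4 * 2048 */
	printf("%llu\n", find_goal(1024, 32768, 101));	/* 1024 + 5 * 2048 */
	return 0;
}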
- */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -324,64 +301,7 @@ ext4_ext_max_entries(struct inode *inode, int depth) return max; } -static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) -{ - ext4_fsblk_t block = ext_pblock(ext); - int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; -} - -static int ext4_valid_extent_idx(struct inode *inode, - struct ext4_extent_idx *ext_idx) -{ - ext4_fsblk_t block = idx_pblock(ext_idx); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) - return 0; - else - return 1; -} - -static int ext4_valid_extent_entries(struct inode *inode, - struct ext4_extent_header *eh, - int depth) -{ - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; - unsigned short entries; - if (eh->eh_entries == 0) - return 1; - - entries = le16_to_cpu(eh->eh_entries); - - if (depth == 0) { - /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); - while (entries) { - if (!ext4_valid_extent(inode, ext)) - return 0; - ext++; - entries--; - } - } else { - ext_idx = EXT_FIRST_INDEX(eh); - while (entries) { - if (!ext4_valid_extent_idx(inode, ext_idx)) - return 0; - ext_idx++; - entries--; - } - } - return 1; -} - -static int __ext4_ext_check(const char *function, struct inode *inode, +static int __ext4_ext_check_header(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -409,15 +329,11 @@ static int __ext4_ext_check(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, depth)) { - error_msg = "invalid extent entries"; - goto corrupted; - } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header/extent in inode #%lu: %s - magic %x, " + "bad header in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -426,13 +342,8 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, inode, eh, depth) - -int ext4_ext_check_inode(struct inode *inode) -{ - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); -} +#define ext4_ext_check_header(inode, eh, depth) \ + __ext4_ext_check_header(__func__, inode, eh, depth) #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -636,6 +547,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); + if (ext4_ext_check_header(inode, eh, depth)) + return ERR_PTR(-EIO); + /* account possible depth increase */ if (!path) { @@ -651,8 +565,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { - int need_to_validate = 0; - ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -661,17 +573,10 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = 
sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) goto err; - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto err; - } - /* validate the extent entries */ - need_to_validate = 1; - } + eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -679,7 +584,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (need_to_validate && ext4_ext_check(inode, eh, i)) + if (ext4_ext_check_header(inode, eh, i)) goto err; } @@ -1276,7 +1181,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1289,7 +1194,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -2232,7 +2137,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2286,7 +2191,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check(inode, ext_block_hdr(bh), + if (ext4_ext_check_header(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index 588af8c77246..f731cb545a03 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -33,14 +33,9 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { - if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { - ext4_alloc_da_blocks(inode); - EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; - } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1) && - !EXT4_I(inode)->i_reserved_data_blocks) + (atomic_read(&inode->i_writecount) == 1)) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/trunk/fs/ext4/ialloc.c b/trunk/fs/ext4/ialloc.c index 47b84e8df568..fb51b40e3e8f 100644 --- a/trunk/fs/ext4/ialloc.c +++ b/trunk/fs/ext4/ialloc.c @@ -189,6 +189,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", @@ -267,13 +268,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_dec(&sbi->s_flex_groups[f].free_inodes); - } - } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); @@ -283,10 +277,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_dec(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - 
atomic_inc(&sbi->s_flex_groups[f].free_inodes); + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); @@ -366,9 +360,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); + flexbg_free_blocks = flex_group[best_flex].free_blocks; flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && + if (flex_group[best_flex].free_inodes && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -381,24 +375,24 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); + flexbg_free_blocks = flex_group[i].free_blocks; flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { + flex_group[i].free_inodes) { best_flex = i; goto found_flexbg; } - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && - atomic_read(&flex_group[i].free_inodes))) + if (flex_group[best_flex].free_inodes == 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) best_flex = i; } - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) return -1; found_flexbg: @@ -416,42 +410,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, return 0; } -struct orlov_stats { - __u32 free_inodes; - __u32 free_blocks; - __u32 used_dirs; -}; - -/* - * Helper function for Orlov's allocator; returns critical information - * for a particular block group or flex_bg. If flex_size is 1, then g - * is a block group number; otherwise it is flex_bg number. - */ -void get_orlov_stats(struct super_block *sb, ext4_group_t g, - int flex_size, struct orlov_stats *stats) -{ - struct ext4_group_desc *desc; - struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; - - if (flex_size > 1) { - stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); - stats->used_dirs = atomic_read(&flex_group[g].used_dirs); - return; - } - - desc = ext4_get_group_desc(sb, g, NULL); - if (desc) { - stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); - stats->used_dirs = ext4_used_dirs_count(sb, desc); - } else { - stats->free_inodes = 0; - stats->free_blocks = 0; - stats->used_dirs = 0; - } -} - /* * Orlov's allocator for directories. * @@ -467,34 +425,35 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or + * it is already running up too large a debt (max_debt). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest.
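An editorial aside, not part of the patch: find_group_flex(), as restored above (reading the plain __u32 counters directly rather than through atomic_read()), accepts a flex group only if it still has free inodes and its free-block percentage clears the free_block_ratio threshold. A standalone model of that acceptance test, all names hypothetical:

#include <stdio.h>

static int flex_group_ok(unsigned free_blocks, unsigned free_inodes,
			 unsigned blocks_per_flex, unsigned free_block_ratio)
{
	unsigned flex_freeb_ratio = free_blocks * 100 / blocks_per_flex;

	return free_inodes && flex_freeb_ratio > free_block_ratio;
}

int main(void)
{
	/* 12000 of 524288 blocks free is about 2%: rejected at 10% */
	printf("%d\n", flex_group_ok(12000, 50, 524288, 10));	/* 0 */
	printf("%d\n", flex_group_ok(90000, 50, 524288, 10));	/* 1 */
	return 0;
}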
If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. */ +#define INODE_COST 64 +#define BLOCK_COST 256 + static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; ext4_group_t ngroups = sbi->s_groups_count; int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_dirs, min_inodes; + int max_debt, max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i; struct ext4_group_desc *desc; - struct orlov_stats stats; - int flex_size = ext4_flex_bg_size(sbi); - - if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - parent_group >>= sbi->s_log_groups_per_flex; - } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; @@ -503,97 +462,71 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { + if ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { int best_ndir = inodes_per_group; + ext4_group_t grp; int ret = -1; get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - g = (parent_group + i) % ngroups; - get_orlov_stats(sb, g, flex_size, &stats); - if (!stats.free_inodes) + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (stats.used_dirs >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (stats.free_inodes < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; - grp = g; - ret = 0; - best_ndir = stats.used_dirs; - } - if (ret) - goto fallback; - found_flex_bg: - if (flex_size == 1) { *group = grp; - return 0; - } - - /* - * We pack inodes at the beginning of the flexgroup's - * inode tables. Block allocation decisions will do - * something similar, although regular files will - * start at 2nd block group of the flexgroup. See - * ext4_ext_find_goal() and ext4_find_near(). 
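An editorial aside, not part of the patch: the debt limit restored above starts from the blocks in a group, divides by the larger of the measured blocks-per-directory and BLOCK_COST, caps the result so max_debt * INODE_COST cannot exceed the inodes in a group, and finally clamps it to the range 1..255. The same computation as a standalone function:

#include <stdio.h>

#define INODE_COST 64
#define BLOCK_COST 256

static int compute_max_debt(int blocks_per_group, int inodes_per_group,
			    long long blocks_per_dir)
{
	int max_debt = blocks_per_group;

	max_debt /= blocks_per_dir > BLOCK_COST ? blocks_per_dir : BLOCK_COST;
	if (max_debt * INODE_COST > inodes_per_group)
		max_debt = inodes_per_group / INODE_COST;
	if (max_debt > 255)
		max_debt = 255;
	if (max_debt == 0)
		max_debt = 1;
	return max_debt;
}

int main(void)
{
	/* a common 4KB-block layout: 32768 blocks, 8192 inodes per group */
	printf("max_debt = %d\n", compute_max_debt(32768, 8192, 100));	/* 128 */
	return 0;
}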
- */ - grp *= flex_size; - for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) - break; - desc = ext4_get_group_desc(sb, grp+i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = grp+i; - return 0; - } + ret = 0; + best_ndir = ext4_used_dirs_count(sb, desc); } + if (ret == 0) + return ret; goto fallback; } - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group*flex_size / 4; - if (min_inodes < 1) - min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + blocks_per_dir = ext4_blocks_count(es) - freeb; + do_div(blocks_per_dir, ndirs); - /* - * Start looking in the flex group where we last allocated an - * inode for this parent directory - */ - if (EXT4_I(parent)->i_last_alloc_group != ~0) { - parent_group = EXT4_I(parent)->i_last_alloc_group; - if (flex_size > 1) - parent_group >>= sbi->s_log_groups_per_flex; - } + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT4_BLOCKS_PER_GROUP(sb); + max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - get_orlov_stats(sb, grp, flex_size, &stats); - if (stats.used_dirs >= max_dirs) + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) + continue; + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (stats.free_inodes < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; - goto found_flex_bg; + return 0; } fallback: - ngroups = sbi->s_groups_count; - avefreei = freei / ngroups; - parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { - *group = grp; + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; - } } if (avefreei) { @@ -609,51 +542,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *desc; - ext4_group_t i, last; - int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); - - /* - * Try to place the inode is the same flex group as its - * parent. If we can't find space, use the Orlov algorithm to - * find another flex group, and store that information in the - * parent directory's inode information so that use that flex - * group for future allocations. 
- */ - if (flex_size > 1) { - int retry = 0; - - try_again: - parent_group &= ~(flex_size-1); - last = parent_group + flex_size; - if (last > ngroups) - last = ngroups; - for (i = parent_group; i < last; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = i; - return 0; - } - } - if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { - retry = 1; - parent_group = EXT4_I(parent)->i_last_alloc_group; - goto try_again; - } - /* - * If this didn't work, use the Orlov search algorithm - * to find a new flex group; we pass in the mode to - * avoid the topdir algorithms. - */ - *group = parent_group + flex_size; - if (*group > ngroups) - *group = 0; - return find_group_orlov(sb, parent, group, mode); - } + ext4_group_t i; /* * Try to place the inode in its parent directory @@ -771,11 +665,6 @@ static int ext4_claim_inode(struct super_block *sb, if (S_ISDIR(mode)) { count = ext4_used_dirs_count(sb, gdp) + 1; ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].free_inodes); - } } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: @@ -827,10 +716,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { + if (sbi->s_log_groups_per_flex) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group, mode); + ret2 = find_group_other(sb, dir, &group); if (ret2 == 0 && once) once = 0; printk(KERN_NOTICE "ext4: find_group_flex " @@ -844,12 +733,11 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group, mode); + ret2 = find_group_orlov(sb, dir, &group); } else - ret2 = find_group_other(sb, dir, &group, mode); + ret2 = find_group_other(sb, dir, &group); got_group: - EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; @@ -970,7 +858,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); - atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } inode->i_uid = current_fsuid(); @@ -995,16 +885,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ei->i_disksize = 0; /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified + * Don't inherit extent flag from directory. 
We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified */ - ei->i_flags = - ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); + /* dirsync only applies to directories */ + if (!S_ISDIR(mode)) + ei->i_flags &= ~EXT4_DIRSYNC_FL; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; - ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index a2e7952bc5f9..dd82ff390067 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -371,34 +371,6 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, struct inode *inode, - unsigned int *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; - while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { - ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - *bref, maxblocks, - inode->i_ino, (int)(bref-p)); - return -EIO; - } - bref++; - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -443,22 +415,9 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -500,8 +459,6 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -517,22 +474,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - /* - * If we are doing delayed allocation, we don't need take - * colour into account. 
- */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -1108,16 +1052,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) /* * free those over-booking quota for metadata blocks */ + if (mdb_free) vfs_dq_release_reservation_block(inode, mdb_free); - - /* - * If we have done all the pending block allocations and if - * there aren't any writers on the inode, we can discard the - * inode's preallocations. - */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) - ext4_discard_preallocations(inode); } /* @@ -1751,10 +1688,9 @@ static void ext4_da_page_release_reservation(struct page *page, struct mpage_da_data { struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ + struct buffer_head lbh; /* extent of blocks */ unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; struct writeback_control *wbc; int io_done; int pages_written; @@ -1768,6 +1704,7 @@ struct mpage_da_data { * @mpd->inode: inode * @mpd->first_page: first page of the extent * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function * * By the time mpage_da_submit_io() is called we expect all blocks * to be allocated. this may be wrong if allocation failed. @@ -1787,7 +1724,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking + * If we look at mpd->lbh.b_blocknr we would only be looking * at the currently mapped buffer_heads. */ index = mpd->first_page; @@ -1977,111 +1914,68 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - /* * mpage_da_map_blocks - go through given space * - * @mpd - bh describing space + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function * * The function skips space we know is already mapped to disk blocks. 
* */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err = 0; struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; sector_t next; /* * We consider only non-mapped and non-allocated blocks */ - if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + if (buffer_mapped(lbh) && !buffer_delay(lbh)) return 0; - new.b_state = mpd->b_state; + new.b_state = lbh->b_state; new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; + new.b_size = lbh->b_size; + next = lbh->b_blocknr; /* * If we didn't accumulate anything * to write simply return */ if (!new.b_size) return 0; - - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + err = mpd->get_block(mpd->inode, next, &new, 1); if (err) { - /* - * If get block returns with error we simply - * return. Later writepage will redirty the page and - * writepages will find the dirty page again + + /* If get block returns with error + * we simply return. Later writepage + * will redirty the page and writepages + * will find the dirty page again */ if (err == -EAGAIN) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(mpd->inode->i_sb)) { mpd->retval = err; return 0; } /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. + * get block failure will cause us + * to loop in writepages. Because + * a_ops->writepage won't be able to + * make progress. The page will be redirtied + * by writepage and writepages will again + * try to write the same. */ printk(KERN_EMERG "%s block allocation failed for inode %lu " "at logical offset %llu with max blocks " "%zd with error %d\n", __func__, mpd->inode->i_ino, (unsigned long long)next, - mpd->b_size >> mpd->inode->i_blkbits, err); + lbh->b_size >> mpd->inode->i_blkbits, err); printk(KERN_EMERG "This should not happen.!! " "Data will be lost\n"); if (err == -ENOSPC) { @@ -2089,7 +1983,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* invlaidate all the pages */ ext4_da_block_invalidatepages(mpd, next, - mpd->b_size >> mpd->inode->i_blkbits); + lbh->b_size >> mpd->inode->i_blkbits); return err; } BUG_ON(new.b_size == 0); @@ -2101,8 +1995,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if ((mpd->b_state & (1 << BH_Delay)) || - (mpd->b_state & (1 << BH_Unwritten))) + if (buffer_delay(lbh) || buffer_unwritten(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); return 0; @@ -2121,11 +2014,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * the function is used to collect contig. 
blocks in same state */ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, - unsigned long b_state) + sector_t logical, struct buffer_head *bh) { sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; /* check if thereserved journal credits might overflow */ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { @@ -2152,19 +2046,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, /* * First block in the extent */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = b_size; - mpd->b_state = b_state & BH_FLAGS; + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = b_size; + lbh->b_state = bh->b_state & BH_FLAGS; return; } - next = mpd->b_blocknr + nrblocks; + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += b_size; return; } @@ -2193,7 +2087,7 @@ static int __mpage_da_writepage(struct page *page, { struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; - struct buffer_head *bh, *head; + struct buffer_head *bh, *head, fake; sector_t logical; if (mpd->io_done) { @@ -2235,9 +2129,9 @@ static int __mpage_da_writepage(struct page *page, /* * ... and blocks */ - mpd->b_size = 0; - mpd->b_state = 0; - mpd->b_blocknr = 0; + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; } mpd->next_page = page->index + 1; @@ -2245,8 +2139,16 @@ static int __mpage_da_writepage(struct page *page, (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); + /* + * There are no buffer heads attached yet (mmap?), + * so we treat the page as full of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else { @@ -2264,10 +2166,8 @@ static int __mpage_da_writepage(struct page *page, * with the page in ext4_da_writepage */ if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { @@ -2279,8 +2179,9 @@ static int __mpage_da_writepage(struct page *page, * unmapped buffer_head later we need to * use the b_state flag of that buffer_head. */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2289,6 +2190,51 @@ static int __mpage_da_writepage(struct page *page, return 0; } +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocate non-allocated blocks, map newly-allocated + * blocks to existing bhs and issue IO on them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function.
+ * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + int ret; + + if (!mpd->get_block) + return generic_writepages(mapping, wbc); + + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + mpd->first_page = 0; + mpd->next_page = 0; + mpd->io_done = 0; + mpd->pages_written = 0; + mpd->retval = 0; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); + /* + * Handle last extent of pages + */ + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; + return ret; +} + /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space @@ -2328,6 +2274,51 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret > 0) { + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered + * mode. Don't update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + ret = 0; + } + return ret; +} static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { @@ -2578,38 +2569,8 @@ static int ext4_da_writepages(struct address_space *mapping, dump_stack(); goto out_writepages; } - - /* - * Now call __mpage_da_writepage to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4. We don't actually - * submit the blocks for I/O here, even though - * write_cache_pages thinks it will, and will set the - * pages as clean for write before calling - * __mpage_da_writepage(). - */ - mpd.b_size = 0; - mpd.b_state = 0; - mpd.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.io_done = 0; - mpd.pages_written = 0; - mpd.retval = 0; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, - &mpd); - /* - * If we have a contigous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. 
- */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - if (mpage_da_map_blocks(&mpd) == 0) - mpage_da_submit_io(&mpd); - mpd.io_done = 1; - ret = MPAGE_DA_EXTENT_TAIL; - } - wbc->nr_to_write -= mpd.pages_written; + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); @@ -2885,48 +2846,6 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) return; } -/* - * Force all delayed allocation blocks to be allocated for a given inode. - */ -int ext4_alloc_da_blocks(struct inode *inode) -{ - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) - return 0; - - /* - * We do something simple for now. The filemap_flush() will - * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise - * would require replicating code paths in: - * - * ext4_da_writepages() -> - * write_cache_pages() ---> (via passed in callback function) - * __mpage_da_writepage() --> - * mpage_add_bh_to_extent() - * mpage_da_map_blocks() - * - * The problem is that write_cache_pages(), located in - * mm/page-writeback.c, marks pages clean in preparation for - * doing I/O, which is not desirable if we're not planning on - * doing I/O at all. - * - * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writeback() but that - * would be ugly in the extreme. So instead we would need to - * replicate parts of the code in the above functions, - * simplifying them becuase we wouldn't actually intend to - * write out the pages, but rather only collect contiguous - * logical block extents, call the multi-block allocator, and - * then update the buffer heads with the block allocations. - * - * For now, though, we'll cheat by calling filemap_flush(), - * which will map the blocks, and start the I/O, but not - * actually wait for the I/O to complete. - */ - return filemap_flush(inode->i_mapping); -} /* * bmap() is special. It gets used by applications such as lilo and by @@ -3949,9 +3868,6 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); return; @@ -4194,7 +4110,12 @@ static int __ext4_get_inode_loc(struct inode *inode, unsigned num; table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ + /* Make sure s_inode_readahead_blks is a power of 2 */ + while (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)) + EXT4_SB(sb)->s_inode_readahead_blks = + (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)); b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); if (table > b) b = table; @@ -4366,7 +4287,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; - ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! 
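The loop added to __ext4_get_inode_loc() rounds s_inode_readahead_blks down to a power of two by clearing the lowest set bit, x & (x - 1), until a single bit remains. The same idiom in isolation:

#include <stdio.h>

/* Round down to a power of two by clearing low-order set bits,
 * the same x &= (x - 1) idiom applied to s_inode_readahead_blks. */
static unsigned round_down_pow2(unsigned x)
{
	while (x & (x - 1))
		x &= (x - 1);	/* drop the lowest set bit */
	return x;		/* 0 stays 0; otherwise the highest set bit */
}

int main(void)
{
	printf("%u %u %u\n", round_down_pow2(24), round_down_pow2(32),
	       round_down_pow2(1)); /* 16 32 1 */
	return 0;
}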
@@ -4409,20 +4329,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); - } - if (ret) { - brelse(bh); - goto bad_inode; - } - if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -4439,8 +4345,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + } else { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, @@ -4448,13 +4353,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); - } else { - brelse(bh); - ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", - inode->i_mode, inode->i_ino); - goto bad_inode; } brelse(iloc.bh); ext4_set_inode_flags(inode); diff --git a/trunk/fs/ext4/ioctl.c b/trunk/fs/ext4/ioctl.c index 91e75f7a9e73..42dc83fb247a 100644 --- a/trunk/fs/ext4/ioctl.c +++ b/trunk/fs/ext4/ioctl.c @@ -48,7 +48,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT4_DIRSYNC_FL; err = -EPERM; mutex_lock(&inode->i_mutex); @@ -262,20 +263,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return err; } - case EXT4_IOC_ALLOC_DA_BLKS: - { - int err; - if (!is_owner_or_cap(inode)) - return -EACCES; - - err = mnt_want_write(filp->f_path.mnt); - if (err) - return err; - err = ext4_alloc_da_blocks(inode); - mnt_drop_write(filp->f_path.mnt); - return err; - } - default: return -ENOTTY; } diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index f871677a7984..b038188bd039 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -46,23 +46,22 @@ * The allocation request involves a request for multiple blocks * near to the goal (block) value specified. * - * During initialization phase of the allocator we decide to use the - * group preallocation or inode preallocation depending on the size of - * the file. The size of the file could be the resulting file size we - * would have after allocation, or the current file size, whichever - * is larger. If the size is less than sbi->s_mb_stream_request we - * select to use the group preallocation. The default value of - * s_mb_stream_request is 16 blocks. This can also be tuned via - * /sys/fs/ext4//mb_stream_req. The value is represented in - * terms of number of blocks. + * During initialization phase of the allocator we decide to use the group + * preallocation or inode preallocation depending on the size of the file. The + * size of the file could be the resulting file size we would have after + * allocation or the current file size, whichever is larger. If the size is + * less than sbi->s_mb_stream_request we select the group + * preallocation.
The default value of s_mb_stream_request is 16 + * blocks. This can also be tuned via + * /proc/fs/ext4//stream_req. The value is represented in terms + * of number of blocks. * * The main motivation for having small files use group preallocation is to - * ensure that we have small files closer together on the disk. + * ensure that we have small files closer on the disk. * - * First stage the allocator looks at the inode prealloc list, - * ext4_inode_info->i_prealloc_list, which contains list of prealloc - * spaces for this particular inode. The inode prealloc space is - * represented as: + * First stage the allocator looks at the inode prealloc list + * ext4_inode_info->i_prealloc_list, which contains a list of prealloc spaces for + * this particular inode. The inode prealloc space is represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space @@ -122,29 +121,29 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to * 512 blocks. This can be tuned via - * /sys/fs/ext4/ option the group prealloc request is normalized to the * stripe value (sbi->s_stripe) * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator(using the buddy cache) supports a few tunables. * - * /sys/fs/ext4//mb_min_to_scan - * /sys/fs/ext4//mb_max_to_scan - * /sys/fs/ext4//mb_order2_req + * /proc/fs/ext4//min_to_scan + * /proc/fs/ext4//max_to_scan + * /proc/fs/ext4//order2_req * - * The regular allocator uses buddy scan only if the request len is a power of + * The regular allocator uses buddy scan only if the request len is a power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /sys/fs/ext4//mb_order2_req. If the request len is equal to + * /proc/fs/ext4//order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous blocks in - * stripe size. This should result in better allocation on RAID setups. If - * not, we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan control the behaviour here. + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicates how long the mballoc __must__ look for a best - * extent and max_to_scan indicates how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex.
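Stripped of the surrounding machinery, the policy described above is a single comparison: a file whose relevant size (the larger of the current and resulting sizes) stays below s_mb_stream_request draws from the shared group preallocation, while anything bigger gets its own inode preallocation. A sketch of that decision, using the 16-block default:

#include <stdio.h>

enum prealloc { GROUP_PA, INODE_PA };

/* Model of the heuristic the comment describes: small files share the
 * locality-group preallocation so they end up packed close together;
 * the 16-block threshold mirrors the s_mb_stream_request default. */
static enum prealloc choose_prealloc(unsigned long size_blocks,
				     unsigned long stream_req)
{
	return size_blocks < stream_req ? GROUP_PA : INODE_PA;
}

int main(void)
{
	printf("%s\n", choose_prealloc(4, 16) == GROUP_PA ? "group" : "inode");
	printf("%s\n", choose_prealloc(64, 16) == GROUP_PA ? "group" : "inode");
	return 0;
}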
Each group is first checked based on the criteria whether it @@ -338,6 +337,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -1725,7 +1726,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, { unsigned free, fragments; unsigned i, bits; - int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -1747,12 +1747,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) return 0; - /* Avoid using the first bg of a flexgroup for data files */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA) && - (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && - ((group % flex_size) == 0)) - return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) @@ -1977,7 +1971,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /sys/fs/ext4//mb_order2_req + * You can tune it via /proc/fs/ext4//order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2699,7 +2693,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); return -ENOMEM; } @@ -2752,6 +2746,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); if (sbi->s_journal) @@ -2834,6 +2829,7 @@ int ext4_mb_release(struct super_block *sb) free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); + ext4_mb_destroy_per_dev_proc(sb); return 0; } @@ -2894,6 +2890,62 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug("freed %u blocks in %u structures\n", count, count2); } +#define EXT4_MB_STATS_NAME "stats" +#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" +#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" +#define EXT4_MB_ORDER2_REQ "order2_req" +#define EXT4_MB_STREAM_REQ "stream_req" +#define EXT4_MB_GROUP_PREALLOC "group_prealloc" + +static int ext4_mb_init_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct proc_dir_entry *proc; + + if (sbi->s_proc == NULL) + return -EINVAL; + + EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); + EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); + return 0; + +err_out: + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + 
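The i >= sbi->s_mb_order2_reqs gate above only ever fires for power-of-two request lengths. A small model of the eligibility test; the kernel computes the order differently (via fls()), so this is a sketch rather than the real code:

#include <stdio.h>

/* Sketch of the buddy-scan eligibility test: the request length must be
 * a power of two and its order at least order2_reqs (cf. s_mb_order2_reqs). */
static int use_buddy_scan(unsigned len, unsigned order2_reqs)
{
	unsigned order = 0;

	if (len == 0 || (len & (len - 1)))
		return 0;		/* not a power of two */
	while ((1u << order) < len)
		order++;
	return order >= order2_reqs;
}

int main(void)
{
	printf("%d %d %d\n", use_buddy_scan(16, 2),	/* 1: order 4 >= 2 */
	       use_buddy_scan(12, 2),			/* 0: not a power of 2 */
	       use_buddy_scan(2, 2));			/* 0: order 1 < 2 */
	return 0;
}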
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); + return -ENOMEM; +#else + return 0; +#endif +} + +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_proc == NULL) + return -EINVAL; + + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); +#endif + return 0; +} + int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -3044,8 +3096,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3063,7 +3116,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * here we normalize request for locality group * Group request are normalized to s_strip size if we set the same via mount * option. If not we set it to s_mb_group_prealloc which can be configured via - * /sys/fs/ext4//mb_group_prealloc + * /proc/fs/ext4//group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -3555,11 +3608,8 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* - * If doing group-based preallocation, pa_pstart may be in the - * next group when pa is used up - */ - if (pa->pa_type == MB_GROUP_PA) + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) grp_blk--; ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); @@ -3654,7 +3704,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_type = MB_INODE_PA; + pa->pa_linear = 0; mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3717,7 +3767,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_type = MB_GROUP_PA; + pa->pa_linear = 1; mb_debug("new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3971,7 +4021,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_type == MB_GROUP_PA) + if (pa->pa_linear) ext4_mb_release_group_pa(&e4b, pa, ac); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); @@ -4071,7 +4121,7 @@ void ext4_discard_preallocations(struct inode *inode) spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_type != MB_INODE_PA); + BUG_ON(pa->pa_linear != 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); @@ -4182,7 +4232,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * 
allocation, whichever is larger * - * One can tune this size via /sys/fs/ext4//mb_stream_req + * One can tune this size via /proc/fs/ext4//stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { @@ -4323,7 +4373,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, continue; } /* only lg prealloc space */ - BUG_ON(pa->pa_type != MB_GROUP_PA); + BUG_ON(!pa->pa_linear); /* seems this one can be freed ... */ pa->pa_deleted = 1; @@ -4392,7 +4442,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) pa_inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); + spin_unlock(&pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { @@ -4429,7 +4479,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { - if (pa->pa_type == MB_GROUP_PA) { + if (pa->pa_linear) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += ac->ac_b_ex.fe_len; @@ -4449,7 +4499,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. We need to release * alloc_semp before calling ext4_mb_add_n_trim() */ - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + if (pa->pa_linear && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); @@ -4886,7 +4936,9 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } ext4_mb_release_desc(&e4b); diff --git a/trunk/fs/ext4/mballoc.h b/trunk/fs/ext4/mballoc.h index dd9e6cd5f6cf..10a2921baf14 100644 --- a/trunk/fs/ext4/mballoc.h +++ b/trunk/fs/ext4/mballoc.h @@ -132,15 +132,12 @@ struct ext4_prealloc_space { ext4_lblk_t pa_lstart; /* log. block */ unsigned short pa_len; /* len of preallocated chunk */ unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_type; /* pa type. inode or group */
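Both flex-group hunks above replace atomic counter updates with a plain read-modify-write under sb_bgl_lock(). The shape is an ordinary lock/update/unlock critical section; in userspace pthread terms:

#include <pthread.h>
#include <stdio.h>

/* Userspace analogue of the sb_bgl_lock()-protected update that replaces
 * atomic_add()/atomic_sub() on the flex group's free_blocks counter. */
struct flex_group {
	pthread_mutex_t lock;	/* stands in for sb_bgl_lock() */
	long free_blocks;
};

static void flex_add_blocks(struct flex_group *fg, long count)
{
	pthread_mutex_lock(&fg->lock);
	fg->free_blocks += count;	/* whole read-modify-write is protected */
	pthread_mutex_unlock(&fg->lock);
}

int main(void)
{
	struct flex_group fg = { PTHREAD_MUTEX_INITIALIZER, 1000 };

	flex_add_blocks(&fg, 8);
	printf("%ld\n", fg.free_blocks); /* 1008 */
	return 0;
}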
+ unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ }; -enum { - MB_INODE_PA = 0, - MB_GROUP_PA = 1 -}; struct ext4_free_extent { ext4_lblk_t fe_logical; @@ -250,6 +247,7 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index 22098e1cd085..83410244d3ee 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_frame *frame, int *err); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, +static int dx_make_map(struct ext4_dir_entry_2 *de, int size, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); + struct dx_map_entry *offsets, int count); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -180,38 +180,14 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); -unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -} - -__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -} - /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * -ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +ext4_next_entry(struct ext4_dir_entry_2 *p) { return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len, blocksize)); + ext4_rec_len_from_disk(p->rec_len)); } /* @@ -318,7 +294,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent space += EXT4_DIR_REC_LEN(de->name_len); names++; } - de = ext4_next_entry(de, size); + de = ext4_next_entry(de); } printk("(%i)\n", names); return (struct stats) { names, space, 1 }; } @@ -609,7 +585,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + for (; de < top; de = ext4_next_entry(de)) { if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { @@ -687,7 +663,7 @@ int
ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de); if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) goto errout; count++; @@ -737,15 +713,15 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail) +static int dx_make_map (struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + blocksize) { + while ((char *) de < base + size) + { if (de->name_len && de->inode) { ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; @@ -756,7 +732,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, cond_resched(); } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); } return count; } @@ -856,8 +832,7 @@ static inline int search_dirblock(struct buffer_head *bh, return 1; } /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); + de_len = ext4_rec_len_from_disk(de->rec_len); if (de_len <= 0) return -1; offset += de_len; @@ -1021,7 +996,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { + for (; de < top; de = ext4_next_entry(de)) { int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); @@ -1077,16 +1052,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { - if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, __func__, - "deleted inode referenced: %u", - ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } - } + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); } @@ -1142,8 +1109,7 @@ static inline void ext4_set_de_type(struct super_block *sb, * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, - unsigned blocksize) +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) { unsigned rec_len = 0; @@ -1152,7 +1118,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len, blocksize); + ext4_rec_len_to_disk(rec_len); de->inode = 0; map++; to += rec_len; @@ -1164,19 +1130,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. 
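The ext4_rec_len_from_disk()/ext4_rec_len_to_disk() pair removed earlier lets a 16-bit on-disk field describe rec_len values up to 256KiB: lengths are multiples of four, so bits 16-17 of a large length can hide in the two low bits. A standalone round-trip of that encoding (userspace sketch, byte order ignored):

#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* mirrors EXT4_MAX_REC_LEN */

/* Round-trip of the removed rec_len encoding: lengths are multiples of 4,
 * so bits 16-17 of a large length are folded into the two low bits. */
static unsigned short to_disk(unsigned len, unsigned blocksize)
{
	if (len < 65536)
		return (unsigned short)len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	return (unsigned short)((len & 65532) | ((len >> 16) & 3));
}

static unsigned from_disk(unsigned short dlen, unsigned blocksize)
{
	if (dlen == MAX_REC_LEN || dlen == 0)
		return blocksize;
	return (dlen & 65532) | ((dlen & 3) << 16);
}

int main(void)
{
	unsigned blocksize = 1 << 18;
	unsigned len = 200000;	/* a multiple of 4, as real rec_lens are */

	printf("%u\n", from_disk(to_disk(len, blocksize), blocksize)); /* 200000 */
	return 0;
}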
*/ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + blocksize) { - next = ext4_next_entry(de, blocksize); + while ((char*)de < base + size) { + next = ext4_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); if (de > to) memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + to->rec_len = ext4_rec_len_to_disk(rec_len); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } @@ -1249,12 +1215,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de2 = dx_move_dirents(data1, data2, map + split, count - split); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, - blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1304,7 +1268,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int offset = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1312,7 +1275,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { @@ -1324,7 +1287,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + rlen = ext4_rec_len_from_disk(de->rec_len); if ((de->inode? 
rlen - nlen: rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); @@ -1343,11 +1306,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling */ nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + rlen = ext4_rec_len_from_disk(de->rec_len); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); + de->rec_len = ext4_rec_len_to_disk(nlen); de = de1; } de->file_type = EXT4_FT_UNKNOWN; @@ -1417,7 +1380,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len, blocksize)); + ext4_rec_len_from_disk(fde->rec_len)); if ((char *) de >= (((char *) root) + blocksize)) { ext4_error(dir->i_sb, __func__, "invalid rec_len for '..' in inode %lu", @@ -1439,14 +1402,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + while ((char *)(de2 = ext4_next_entry(de)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -1527,7 +1488,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1590,8 +1551,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, - sb->s_blocksize); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); @@ -1679,7 +1639,6 @@ static int ext4_delete_entry(handle_t *handle, struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; - unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; @@ -1693,11 +1652,8 @@ static int ext4_delete_entry(handle_t *handle, ext4_journal_get_write_access(handle, bh); if (pde) pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len, - blocksize) + - ext4_rec_len_from_disk(de->rec_len, - blocksize), - blocksize); + ext4_rec_len_from_disk(pde->rec_len) + + ext4_rec_len_from_disk(de->rec_len)); else de->inode = 0; dir->i_version++; @@ -1705,9 +1661,9 @@ static int ext4_delete_entry(handle_t *handle, ext4_handle_dirty_metadata(handle, dir, bh); return 0; } - i += 
ext4_rec_len_from_disk(de->rec_len, blocksize); + i += ext4_rec_len_from_disk(de->rec_len); pde = de; - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); } return -ENOENT; } @@ -1837,7 +1793,6 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode *inode; struct buffer_head *dir_block; struct ext4_dir_entry_2 *de; - unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1869,14 +1824,13 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), - blocksize); + de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(1)); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); @@ -1931,7 +1885,7 @@ static int empty_dir(struct inode *inode) return 1; } de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); + de1 = ext4_next_entry(de); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || strcmp(".", de->name) || @@ -1942,9 +1896,9 @@ static int empty_dir(struct inode *inode) brelse(bh); return 1; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset = ext4_rec_len_from_disk(de->rec_len) + + ext4_rec_len_from_disk(de1->rec_len); + de = ext4_next_entry(de1); while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { @@ -1973,8 +1927,8 @@ static int empty_dir(struct inode *inode) brelse(bh); return 0; } - offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len); + de = ext4_next_entry(de); } brelse(bh); return 1; @@ -2343,8 +2297,8 @@ static int ext4_link(struct dentry *old_dentry, return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) +#define PARENT_INO(buffer) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -2357,7 +2311,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; + int retval; old_bh = new_bh = dir_bh = NULL; @@ -2404,8 +2358,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -2477,8 +2430,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); 
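The simplified PARENT_INO() relies on the fixed layout of a directory's first block: entry 0 is "." and hopping over its rec_len lands on "..", whose inode field names the parent. A userspace illustration that builds such a block by hand (a flat layout standing in for the real on-disk format, endianness ignored):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified view of an ext4 directory block: entry 0 is ".", and the
 * ".." entry (holding the parent inode) starts rec_len bytes later --
 * exactly the hop the PARENT_INO() macro takes via ext4_next_entry(). */
static uint32_t parent_ino(const unsigned char *block)
{
	uint16_t rec_len;
	uint32_t ino;

	memcpy(&rec_len, block + 4, sizeof(rec_len));	/* "."'s rec_len */
	memcpy(&ino, block + rec_len, sizeof(ino));	/* ".."'s inode */
	return ino;
}

int main(void)
{
	unsigned char block[64] = {0};
	uint32_t self = 12, parent = 2;
	uint16_t dot_rec_len = 12;

	memcpy(block, &self, 4);		/* "." entry: inode */
	memcpy(block + 4, &dot_rec_len, 2);	/* "." entry: rec_len */
	memcpy(block + 12, &parent, 4);		/* ".." entry: inode */
	printf("parent=%u\n", parent_ino(block)); /* parent=2 */
	return 0;
}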
ext4_journal_get_write_access(handle, dir_bh); - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); @@ -2497,8 +2449,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; } retval = 0; @@ -2507,8 +2457,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(old_bh); brelse(new_bh); ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); return retval; } diff --git a/trunk/fs/ext4/resize.c b/trunk/fs/ext4/resize.c index 546c7dd869e1..c06886abd658 100644 --- a/trunk/fs/ext4/resize.c +++ b/trunk/fs/ext4/resize.c @@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); + sbi->s_flex_groups[flex_group].free_blocks += + input->free_blocks_count; + sbi->s_flex_groups[flex_group].free_inodes += + EXT4_INODES_PER_GROUP(sb); } ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 9987bba99db3..f7371a6a923d 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -49,7 +48,6 @@ #include "group.h" struct proc_dir_entry *ext4_proc_root; -static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -579,9 +577,9 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb, es, 1); } if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } - kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -617,17 +615,6 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the - * superblock, we need to actually destroy the kobject. 
- */ - unlock_kernel(); - unlock_super(sb); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - lock_super(sb); - lock_kernel(); - kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -816,6 +803,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",noacl"); #endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); @@ -866,9 +855,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - ext4_show_quota_options(seq, sb); return 0; } @@ -1018,7 +1004,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1026,8 +1012,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1053,6 +1039,8 @@ static const match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, @@ -1080,8 +1068,6 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1089,9 +1075,6 @@ static const match_table_t tokens = { {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_err, NULL}, }; @@ -1224,6 +1207,12 @@ static int parse_options(char *options, struct super_block *sb, "not supported\n"); break; #endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; case Opt_journal_update: /* @@@ FIXME */ /* Eventually we will want to be able to create @@ -1426,14 +1415,9 @@ static int parse_options(char *options, struct super_block *sb, case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; - case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); - break; case Opt_barrier: - if (match_int(&args[0], &option)) { - set_opt(sbi->s_mount_opt, BARRIER); - break; - } + if (match_int(&args[0], &option)) + return 0; if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1479,11 
+1463,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { - printk(KERN_ERR "EXT4-fs: inode_readahead_blks" - " must be a power of 2\n"); - return 0; - } sbi->s_inode_readahead_blks = option; break; case Opt_journal_ioprio: @@ -1494,19 +1473,6 @@ static int parse_options(char *options, struct super_block *sb, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; - case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) { - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - break; - } - if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); - break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1646,12 +1612,10 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, - ext4_free_inodes_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, - ext4_free_blks_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, - ext4_used_dirs_count(sb, gdp)); + sbi->s_flex_groups[flex_group].free_inodes += + ext4_free_inodes_count(sb, gdp); + sbi->s_flex_groups[flex_group].free_blocks += + ext4_free_blks_count(sb, gdp); } return 1; @@ -2027,181 +1991,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } -/* sysfs supprt */ - -struct ext4_attr { - struct attribute attr; - ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, - const char *, size_t); - int offset; -}; - -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) -{ - char *endp; - - while (*buf && isspace(*buf)) - buf++; - *value = simple_strtoul(buf, &endp, 0); - while (*endp && isspace(*endp)) - endp++; - if (*endp || *value > max) - return -EINVAL; - - return 0; -} - -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); -} - -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - sbi->s_sectors_written_start) >> 1); -} - -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); -} - -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long t; - - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; - - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) - return -EINVAL; - - sbi->s_inode_readahead_blks = t; - return count; -} - -static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - - return snprintf(buf, PAGE_SIZE, 
"%u\n", *ui); -} - -static ssize_t sbi_ui_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - unsigned long t; - - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; - *ui = t; - return count; -} - -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ -} -#define EXT4_ATTR(name, mode, show, store) \ -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) - -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) -#define EXT4_RW_ATTR_SBI_UI(name, elname) \ - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) -#define ATTR_LIST(name) &ext4_attr_##name.attr - -EXT4_RO_ATTR(delayed_allocation_blocks); -EXT4_RO_ATTR(session_write_kbytes); -EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, - inode_readahead_blks_store, s_inode_readahead_blks); -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); - -static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), - ATTR_LIST(session_write_kbytes), - ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(inode_readahead_blks), - ATTR_LIST(mb_stats), - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), - NULL, -}; - -static ssize_t ext4_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void ext4_sb_release(struct kobject *kobj) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - - -static struct sysfs_ops ext4_attr_ops = { - .show = ext4_attr_show, - .store = ext4_attr_store, -}; - -static struct kobj_type ext4_ktype = { - .default_attrs = ext4_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_sb_release, -}; - static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2232,21 +2021,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - return -ENOMEM; - } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, - sectors[1]); unlock_kernel(); @@ -2284,7 +2064,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; - sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -2322,6 +2101,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* @@ -2545,9 +2325,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_blks); #endif - bgl_lock_init(sbi->s_blockgroup_lock); + bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); @@ -2779,16 +2564,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - sbi->s_kobj.kset = ext4_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, - "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; - /* * akpm: core read_super() calls in here with the superblock locked. 
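The inode_readahead_blks proc file registered above is backed by ext4_ui_proc_fops, whose write path (shown further down) boils down to: bound the input, copy it in, and parse it with strtoul in base 0. A userspace model of that flow; unlike the kernel version, this sketch NUL-terminates explicitly:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Model of the ext4_ui_proc_write() flow: reject writes that don't fit
 * the scratch buffer, terminate the string, then parse with strtoul
 * (base 0, so "32", "0x20" and "040" all work). */
static int ui_proc_write(unsigned long *value, const char *buf, size_t cnt)
{
	char str[32];

	if (cnt >= sizeof(str))
		return -1;		/* -EINVAL in the kernel version */
	memcpy(str, buf, cnt);
	str[cnt] = '\0';
	*value = strtoul(str, NULL, 0);
	return (int)cnt;
}

int main(void)
{
	unsigned long v = 0;

	ui_proc_write(&v, "0x20", 4);
	printf("%lu\n", v); /* 32 */
	return 0;
}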
* That deadlocks, because orphan cleanup needs to lock the superblock @@ -2843,6 +2618,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -3137,10 +2913,6 @@ static int ext4_commit_super(struct super_block *sb, set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3875,6 +3647,45 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } +#ifdef CONFIG_PROC_FS +static int ext4_ui_proc_show(struct seq_file *m, void *v) +{ + unsigned int *p = m->private; + + seq_printf(m, "%u\n", *p); + return 0; +} + +static int ext4_ui_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ext4_ui_proc_show, PDE(inode)->data); +} + +static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; + char str[32]; + + if (cnt >= sizeof(str)) + return -EINVAL; + if (copy_from_user(str, buf, cnt)) + return -EFAULT; + + *p = simple_strtoul(str, NULL, 0); + return cnt; +} + +const struct file_operations ext4_ui_proc_fops = { + .owner = THIS_MODULE, + .open = ext4_ui_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ext4_ui_proc_write, +}; +#endif + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3908,9 +3719,6 @@ static int __init init_ext4_fs(void) { int err; - ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) - return -ENOMEM; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) @@ -3952,7 +3760,6 @@ static void __exit exit_ext4_fs(void) exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/trunk/fs/jbd2/commit.c b/trunk/fs/jbd2/commit.c index 4ea72377c7a2..62804e57a44c 100644 --- a/trunk/fs/jbd2/commit.c +++ b/trunk/fs/jbd2/commit.c @@ -367,7 +367,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; - int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -402,8 +401,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; - if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -683,7 +680,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + submit_bh(WRITE, bh); } cond_resched(); 
stats.u.run.rs_blocks_logged += bufs; diff --git a/trunk/fs/jbd2/revoke.c b/trunk/fs/jbd2/revoke.c index bbe6d592d8b3..257ff2625765 100644 --- a/trunk/fs/jbd2/revoke.c +++ b/trunk/fs/jbd2/revoke.c @@ -55,25 +55,6 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. - * - * Locking rules: - * We keep two hash tables of revoke records. One hashtable belongs to the - * running transaction (is pointed to by journal->j_revoke), the other one - * belongs to the committing transaction. Accesses to the second hash table - * happen only from the kjournald and no other thread touches this table. Also - * journal_switch_revoke_table() which switches which hashtable belongs to the - * running and which to the committing transaction is called only from - * kjournald. Therefore we need no locks when accessing the hashtable belonging - * to the committing transaction. - * - * All users operating on the hash table belonging to the running transaction - * have a handle to the transaction. Therefore they are safe from kjournald - * switching hash tables under them. For operations on the lists of entries in - * the hash table j_revoke_lock is used. - * - * Finally, also replay code uses the hash tables but at this moment noone else - * can touch them (filesystem isn't mounted yet) and hence no locking is - * needed. */ #ifndef __KERNEL__ @@ -420,6 +401,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. + * + * The caller must have the journal locked. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -497,7 +480,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. + * + * Called with the journal lock held. */ + void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 996ffda06bf3..28ce21d8598e 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -1315,8 +1315,6 @@ int jbd2_journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/trunk/fs/lockd/clntlock.c b/trunk/fs/lockd/clntlock.c index 1f3b0fc0d351..aedc47a264c1 100644 --- a/trunk/fs/lockd/clntlock.c +++ b/trunk/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. 
In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. + */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/trunk/fs/lockd/mon.c b/trunk/fs/lockd/mon.c index 6d5d4a4169e5..5e2c4d5ac827 100644 --- a/trunk/fs/lockd/mon.c +++ b/trunk/fs/lockd/mon.c @@ -16,8 +16,6 @@ #include #include -#include - #define NLMDBG_FACILITY NLMDBG_MONITOR #define NSM_PROGRAM 100024 #define NSM_VERSION 1 @@ -276,12 +274,10 @@ static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; struct timespec ts; - s64 ns; ktime_get_ts(&ts); - ns = timespec_to_ns(&ts); - put_unaligned(ns, p); - put_unaligned((unsigned long)nsm, p + 1); + *p++ = timespec_to_ns(&ts); + *p = (unsigned long)nsm; } static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, diff --git a/trunk/fs/lockd/svc.c b/trunk/fs/lockd/svc.c index abf83881f68a..64f1c31b5853 100644 --- a/trunk/fs/lockd/svc.c +++ b/trunk/fs/lockd/svc.c @@ -52,6 +52,17 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. + */ +#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ + defined(CONFIG_SUNRPC_REGISTER_V4) +static const sa_family_t nlmsvc_family = AF_INET6; +#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ +static const sa_family_t nlmsvc_family = AF_INET; +#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ + /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. 
-- Jamie Lokier, Aug 2003 @@ -193,30 +204,19 @@ lockd(void *vrqstp) return 0; } -static int create_lockd_listener(struct svc_serv *serv, const char *name, - const int family, const unsigned short port) +static int create_lockd_listener(struct svc_serv *serv, char *name, + unsigned short port) { struct svc_xprt *xprt; - xprt = svc_find_xprt(serv, name, family, 0); + xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, family, port, - SVC_SOCK_DEFAULTS); + return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + svc_xprt_put(xprt); return 0; } -static int create_lockd_family(struct svc_serv *serv, const int family) -{ - int err; - - err = create_lockd_listener(serv, "udp", family, nlm_udpport); - if (err < 0) - return err; - - return create_lockd_listener(serv, "tcp", family, nlm_tcpport); -} - /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -232,15 +232,13 @@ static int make_socks(struct svc_serv *serv) static int warned; int err; - err = create_lockd_family(serv, PF_INET); + err = create_lockd_listener(serv, "udp", nlm_udpport); if (err < 0) goto out_err; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - err = create_lockd_family(serv, PF_INET6); - if (err < 0 && err != -EAFNOSUPPORT) + err = create_lockd_listener(serv, "tcp", nlm_tcpport); + if (err < 0) goto out_err; -#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; @@ -276,7 +274,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/trunk/fs/nfs/callback.c b/trunk/fs/nfs/callback.c index a886e692ddd0..3e634f2a1083 100644 --- a/trunk/fs/nfs/callback.c +++ b/trunk/fs/nfs/callback.c @@ -38,10 +38,19 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; -unsigned short nfs_callback_tcpport6; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. 
+ */ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const sa_family_t nfs_callback_family = AF_INET6; +#else +static const sa_family_t nfs_callback_family = AF_INET; +#endif + static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -107,29 +116,19 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + nfs_callback_family, NULL); ret = -ENOMEM; if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, PF_INET); - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - ret = svc_create_xprt(serv, "tcp", PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret > 0) { - nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport6, PF_INET6); - } else if (ret != -EAFNOSUPPORT) - goto out_err; -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + nfs_callback_tcpport, nfs_callback_family); nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { diff --git a/trunk/fs/nfs/callback.h b/trunk/fs/nfs/callback.h index e110e286a262..bb25d2135ff1 100644 --- a/trunk/fs/nfs/callback.h +++ b/trunk/fs/nfs/callback.h @@ -72,6 +72,5 @@ extern void nfs_callback_down(void); extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; -extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/trunk/fs/nfs/client.c b/trunk/fs/nfs/client.c index aba38017bdef..2277421656e7 100644 --- a/trunk/fs/nfs/client.c +++ b/trunk/fs/nfs/client.c @@ -224,6 +224,38 @@ void nfs_put_client(struct nfs_client *clp) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) +{ + switch (sa->sa_family) { + default: + return NULL; + case AF_INET6: + return &((const struct sockaddr_in6 *)sa)->sin6_addr; + break; + case AF_INET: + ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr, + addr_mapped); + return addr_mapped; + } +} + +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + return 0; +} + /* * Test if two ip6 socket addresses refer to the same socket by * comparing relevant fields. The padding bytes specifically, are not @@ -235,21 +267,38 @@ void nfs_put_client(struct nfs_client *clp) * * The caller should ensure both socket addresses are AF_INET6. 
 */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
 
-	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    sin1->sin6_scope_id != sin2->sin6_scope_id)
+	if (!ipv6_addr_equal(&saddr1->sin6_addr,
+			     &saddr2->sin6_addr))
 		return 0;
+	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+		return 0;
+	return saddr1->sin6_port == saddr2->sin6_port;
+}
+#else
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+				      const struct sockaddr_in *sa2)
+{
+	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+}
 
-	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
+		return 0;
+	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+					  (const struct sockaddr_in *)sa2);
 }
-#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -262,57 +311,20 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
-	       (sin1->sin6_port == sin2->sin6_port);
-}
-
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
 
-	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
-	       (sin1->sin_port == sin2->sin_port);
-}
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */ -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; + return saddr1->sin_port == saddr2->sin_port; } /* * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. + * by comparing (only) relevant fields. */ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) diff --git a/trunk/fs/nfs/dir.c b/trunk/fs/nfs/dir.c index 370b190a09d1..78bf72fc1db3 100644 --- a/trunk/fs/nfs/dir.c +++ b/trunk/fs/nfs/dir.c @@ -1624,7 +1624,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (atomic_read(&new_dentry->d_count) > 1) /* dentry still busy? */ goto out; - } + } else + nfs_drop_nlink(new_inode); go_ahead: /* @@ -1637,8 +1638,10 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) + if (new_inode != NULL) { nfs_inode_return_delegation(new_inode); + d_delete(new_dentry); + } error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); @@ -1647,8 +1650,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rehash) d_rehash(rehash); if (!error) { - if (new_inode != NULL) - nfs_drop_nlink(new_inode); d_move(old_dentry, new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); diff --git a/trunk/fs/nfs/file.c b/trunk/fs/nfs/file.c index 0abf3f331f56..cec79392e4ba 100644 --- a/trunk/fs/nfs/file.c +++ b/trunk/fs/nfs/file.c @@ -64,7 +64,11 @@ const struct file_operations nfs_file_operations = { .write = do_sync_write, .aio_read = nfs_file_read, .aio_write = nfs_file_write, +#ifdef CONFIG_MMU .mmap = nfs_file_mmap, +#else + .mmap = generic_file_mmap, +#endif .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -137,6 +141,9 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); + /* Ensure that dirty pages are flushed out with the right creds */ + if (filp->f_mode & FMODE_WRITE) + nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -228,6 +235,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; + int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -237,8 +245,11 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Flush writes to the server and return any errors */ - return nfs_do_fsync(ctx, inode); + /* Ensure that data+attribute caches are up to date after close() */ + status = nfs_do_fsync(ctx, inode); + if (!status) + nfs_revalidate_inode(NFS_SERVER(inode), inode); + return status; } static ssize_t @@ -293,13 +304,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems - * so we call that before revalidating the mapping - */ - status = generic_file_mmap(file, 
vma); + status = nfs_revalidate_mapping(inode, file->f_mapping); if (!status) { vma->vm_ops = &nfs_file_vm_ops; - status = nfs_revalidate_mapping(inode, file->f_mapping); + vma->vm_flags |= VM_CAN_NONLINEAR; + file_accessed(file); } return status; } @@ -345,15 +354,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/trunk/fs/nfs/getroot.c b/trunk/fs/nfs/getroot.c index 46177cb87064..b7c9b2df1f29 100644 --- a/trunk/fs/nfs/getroot.c +++ b/trunk/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (!S_ISDIR(fattr.mode)) { + if (fattr.type != NFDIR) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (!S_ISDIR(fattr.mode)) { + if (fattr.type != NFDIR) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/trunk/fs/nfs/inode.c b/trunk/fs/nfs/inode.c index a834d1d850b7..0c381686171e 100644 --- a/trunk/fs/nfs/inode.c +++ b/trunk/fs/nfs/inode.c @@ -65,18 +65,6 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -261,10 +249,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + if ((fattr->valid & NFS_ATTR_FATTR) == 0) goto out_no_inode; - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + + if (!fattr->nlink) { + printk("NFS: Buggy server - nlink == 0!\n"); goto out_no_inode; + } hash = nfs_fattr_to_ino_t(fattr); @@ -300,8 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if ((fattr->valid & NFS_ATTR_FATTR_FSID) - && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -314,45 +304,28 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; - inode->i_size = 0; - inode->i_nlink = 0; - inode->i_uid = -2; - inode->i_gid = -2; - inode->i_blocks = 0; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - if (fattr->valid & 
NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; - if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; - if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + inode->i_atime = fattr->atime; + inode->i_mtime = fattr->mtime; + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_V4) nfsi->change_attr = fattr->change_attr; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) - inode->i_size = nfs_size_to_loff_t(fattr->size); - if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; - if (fattr->valid & NFS_ATTR_FATTR_OWNER) - inode->i_uid = fattr->uid; - if (fattr->valid & NFS_ATTR_FATTR_GROUP) - inode->i_gid = fattr->gid; - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) - inode->i_blocks = fattr->du.nfs2.blocks; - if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + inode->i_size = nfs_size_to_loff_t(fattr->size); + inode->i_nlink = fattr->nlink; + inode->i_uid = fattr->uid; + inode->i_gid = fattr->gid; + if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } else { + inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); @@ -541,32 +514,6 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } -/** - * nfs_close_context - Common close_context() routine NFSv2/v3 - * @ctx: pointer to context - * @is_sync: is this a synchronous close - * - * always ensure that the attributes are up to date if we're mounted - * with close-to-open semantics - */ -void nfs_close_context(struct nfs_open_context *ctx, int is_sync) -{ - struct inode *inode; - struct nfs_server *server; - - if (!(ctx->mode & FMODE_WRITE)) - return; - if (!is_sync) - return; - inode = ctx->path.dentry->d_inode; - if (!list_empty(&NFS_I(inode)->open_files)) - return; - server = NFS_SERVER(inode); - if (server->flags & NFS_MOUNT_NOCTO) - return; - nfs_revalidate_inode(server, inode); -} - static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -593,15 +540,24 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode; + + if (ctx == NULL) + return; + inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->state != NULL) { + if (wait) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); + } if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); @@ -714,6 +670,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + if (NFS_STALE(inode)) + goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { @@ -856,31 +815,25 @@ static void nfs_wcc_update_inode(struct inode *inode, struct 
nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) - && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && + nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) - && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_WCC) != 0) { + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - - if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) - && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) - && (fattr->valid & NFS_ATTR_FATTR_SIZE) - && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->npages == 0) + } + if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && + nfsi->npages == 0) i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + } } /** @@ -900,39 +853,35 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) - return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if (nfsi->fileid != fattr->fileid + || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { return -EIO; + } - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - } + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ - if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) + || inode->i_uid != fattr->uid + || inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ - if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + if (inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + if (!timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -944,15 +893,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) - return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -1088,31 +1033,20 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE - | NFS_ATTR_FATTR_PRESIZE - | NFS_ATTR_FATTR_PREMTIME - | NFS_ATTR_FATTR_PRECTIME); + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + fattr->valid |= NFS_ATTR_WCC_V4; } - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR) != 0 && + (fattr->valid & NFS_ATTR_WCC) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); - fattr->valid |= NFS_ATTR_FATTR_PRECTIME; - } - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->valid |= NFS_ATTR_FATTR_PREMTIME; - } - if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + fattr->valid |= NFS_ATTR_WCC; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1144,18 +1078,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + if (nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? 
*/ - if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + if (S_ISDIR(inode->i_mode) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1165,27 +1099,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; - } - } - - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1193,80 +1114,59 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - /* and probably clear data for a directory too as utimes can cause - * havoc with our cache. - */ - if (S_ISDIR(inode->i_mode)) { - invalid |= NFS_INO_INVALID_DATA; - nfs_force_lookup_revalidate(inode); - } - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } + } else if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); } /* Check if our cached file size is stale */ - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - if (fattr->valid & NFS_ATTR_FATTR_ATIME) - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + nfsi->change_attr = fattr->change_attr; - if (fattr->valid & NFS_ATTR_FATTR_MODE) { - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_mode = fattr->mode; - } - } - if (fattr->valid & NFS_ATTR_FATTR_OWNER) { - if (inode->i_uid != fattr->uid) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_uid = fattr->uid; - } - } - if (fattr->valid & NFS_ATTR_FATTR_GROUP) { - if (inode->i_gid != fattr->gid) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_gid = fattr->gid; - } - } + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || + inode->i_uid != fattr->uid || + inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (fattr->valid & NFS_ATTR_FATTR_NLINK) { - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; - } - } + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + inode->i_mode = fattr->mode; + inode->i_nlink = fattr->nlink; + inode->i_uid = fattr->uid; + inode->i_gid = fattr->gid; - if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } else { + inode->i_blocks = fattr->du.nfs2.blocks; } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) - inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1374,6 +1274,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); diff --git a/trunk/fs/nfs/internal.h b/trunk/fs/nfs/internal.h index 2041f68ff1cc..340ede8f608f 100644 --- a/trunk/fs/nfs/internal.h +++ b/trunk/fs/nfs/internal.h @@ -152,9 +152,6 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif -/* proc.c */ -void nfs_close_context(struct nfs_open_context *ctx, int is_sync); - /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); @@ -168,7 +165,6 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/trunk/fs/nfs/nfs2xdr.c b/trunk/fs/nfs/nfs2xdr.c index c862c9340f9a..28bab67d1519 100644 --- a/trunk/fs/nfs/nfs2xdr.c +++ 
b/trunk/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev, type; - type = ntohl(*p++); + u32 rdev; + fattr->type = (enum nfs_ftype) ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -136,9 +136,10 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR_V2; + fattr->valid |= NFS_ATTR_FATTR; fattr->rdev = new_decode_dev(rdev); - if (type == NFCHR && rdev == NFS2_FIFO_DEV) { + if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { + fattr->type = NFFIFO; fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/trunk/fs/nfs/nfs3proc.c b/trunk/fs/nfs/nfs3proc.c index b82fe6847f14..c55be7a7679e 100644 --- a/trunk/fs/nfs/nfs3proc.c +++ b/trunk/fs/nfs/nfs3proc.c @@ -834,5 +834,4 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, - .close_context = nfs_close_context, }; diff --git a/trunk/fs/nfs/nfs3xdr.c b/trunk/fs/nfs/nfs3xdr.c index e6a1932c7110..6cdeacffde46 100644 --- a/trunk/fs/nfs/nfs3xdr.c +++ b/trunk/fs/nfs/nfs3xdr.c @@ -91,15 +91,19 @@ /* * Map file type to S_IFMT bits */ -static const umode_t nfs_type2fmt[] = { - [NF3BAD] = 0, - [NF3REG] = S_IFREG, - [NF3DIR] = S_IFDIR, - [NF3BLK] = S_IFBLK, - [NF3CHR] = S_IFCHR, - [NF3LNK] = S_IFLNK, - [NF3SOCK] = S_IFSOCK, - [NF3FIFO] = S_IFIFO, +static struct { + unsigned int mode; + unsigned int nfs2type; +} nfs_type2fmt[] = { + { 0, NFNON }, + { S_IFREG, NFREG }, + { S_IFDIR, NFDIR }, + { S_IFBLK, NFBLK }, + { S_IFCHR, NFCHR }, + { S_IFLNK, NFLNK }, + { S_IFSOCK, NFSOCK }, + { S_IFIFO, NFFIFO }, + { 0, NFBAD } }; /* @@ -144,12 +148,13 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - umode_t fmode; + int fmode; type = ntohl(*p++); - if (type > NF3FIFO) - type = NF3NON; - fmode = nfs_type2fmt[type]; + if (type >= NF3BAD) + type = NF3BAD; + fmode = nfs_type2fmt[type].mode; + fattr->type = nfs_type2fmt[type].nfs2type; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -172,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= NFS_ATTR_FATTR_V3; + fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); return p; } @@ -228,9 +233,7 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, &fattr->pre_ctime); - fattr->valid |= NFS_ATTR_FATTR_PRESIZE - | NFS_ATTR_FATTR_PREMTIME - | NFS_ATTR_FATTR_PRECTIME; + fattr->valid |= NFS_ATTR_WCC; return p; } diff --git a/trunk/fs/nfs/nfs4proc.c b/trunk/fs/nfs/nfs4proc.c index 97bacccff579..8dde84b988d9 100644 --- a/trunk/fs/nfs/nfs4proc.c +++ b/trunk/fs/nfs/nfs4proc.c @@ -193,6 +193,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } +static int nfs4_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -200,7 +208,7 @@ static int nfs4_wait_clnt_recover(struct 
nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs4_wait_bit_killable, TASK_KILLABLE); return res; } @@ -1431,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1572,15 +1580,6 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st return 0; } -void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) -{ - if (ctx->state == NULL) - return; - if (is_sync) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); -} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -1601,9 +1600,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; - memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); - server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; - server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2083,7 +2079,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->cache_consistency_bitmask; + args->bitmask = server->attr_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2327,7 +2323,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2556,7 +2552,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + data->args.bitmask = server->attr_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2579,7 +2575,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + data->args.bitmask = server->attr_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3682,19 +3678,6 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } -static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) -{ - if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && - (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) - return; - - fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | - NFS_ATTR_FATTR_NLINK; - fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; - fattr->nlink = 2; -} - int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page) { @@ -3721,7 
+3704,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); - nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } @@ -3785,7 +3767,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, - .close_context = nfs4_close_context, }; /* diff --git a/trunk/fs/nfs/nfs4state.c b/trunk/fs/nfs/nfs4state.c index 0298e909559f..2022fe47966f 100644 --- a/trunk/fs/nfs/nfs4state.c +++ b/trunk/fs/nfs/nfs4state.c @@ -62,14 +62,8 @@ static LIST_HEAD(nfs4_clientid_list); static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) { - unsigned short port; - int status; - - port = nfs_callback_tcpport; - if (clp->cl_addr.ss_family == AF_INET6) - port = nfs_callback_tcpport6; - - status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); + int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, + nfs_callback_tcpport, cred); if (status == 0) status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) diff --git a/trunk/fs/nfs/nfs4xdr.c b/trunk/fs/nfs/nfs4xdr.c index 1690f0e44b91..d1e4c8f8a0a9 100644 --- a/trunk/fs/nfs/nfs4xdr.c +++ b/trunk/fs/nfs/nfs4xdr.c @@ -522,17 +522,20 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static const umode_t nfs_type2fmt[] = { - [NF4BAD] = 0, - [NF4REG] = S_IFREG, - [NF4DIR] = S_IFDIR, - [NF4BLK] = S_IFBLK, - [NF4CHR] = S_IFCHR, - [NF4LNK] = S_IFLNK, - [NF4SOCK] = S_IFSOCK, - [NF4FIFO] = S_IFIFO, - [NF4ATTRDIR] = 0, - [NF4NAMEDATTR] = 0, +static struct { + unsigned int mode; + unsigned int nfs2type; +} nfs_type2fmt[] = { + { 0, NFNON }, + { S_IFREG, NFREG }, + { S_IFDIR, NFDIR }, + { S_IFBLK, NFBLK }, + { S_IFCHR, NFCHR }, + { S_IFLNK, NFLNK }, + { S_IFSOCK, NFSOCK }, + { S_IFIFO, NFFIFO }, + { 0, NFNON }, + { 0, NFNON }, }; struct compound_hdr { @@ -2157,7 +2160,6 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) { __be32 *p; - int ret = 0; *type = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) @@ -2170,16 +2172,14 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; - ret = NFS_ATTR_FATTR_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); - return ret; + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); + return 0; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; - int ret = 0; *change = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) @@ -2188,17 +2188,15 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; - ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); - return ret; + return 0; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) { __be32 *p; - int ret = 0; *size = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) @@ -2207,10 +2205,9 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * READ_BUF(8); READ64(*size); bitmap[0] &= ~FATTR4_WORD0_SIZE; - ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: 
file size=%Lu\n", __func__, (unsigned long long)*size); - return ret; + return 0; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2248,7 +2245,6 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { __be32 *p; - int ret = 0; fsid->major = 0; fsid->minor = 0; @@ -2259,12 +2255,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs READ64(fsid->major); READ64(fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; - ret = NFS_ATTR_FATTR_FSID; } dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); - return ret; + return 0; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2302,7 +2297,6 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; - int ret = 0; *fileid = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) @@ -2311,16 +2305,14 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; - ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return ret; + return 0; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; - int ret = 0; *fileid = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) @@ -2329,10 +2321,9 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma READ_BUF(8); READ64(*fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return ret; + return 0; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2488,8 +2479,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) res->nlocations++; } - if (res->nlocations != 0) - status = NFS_ATTR_FATTR_V4_REFERRAL; out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; @@ -2591,30 +2580,26 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) { - uint32_t tmp; __be32 *p; - int ret = 0; *mode = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(tmp); - *mode = tmp & ~S_IFMT; + READ32(*mode); + *mode &= ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; - ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); - return ret; + return 0; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) { __be32 *p; - int ret = 0; *nlink = 1; if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) @@ -2623,17 +2608,15 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t READ_BUF(4); READ32(*nlink); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; - ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); - 
return ret; + return 0; } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) { uint32_t len; __be32 *p; - int ret = 0; *uid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) @@ -2643,9 +2626,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) - ret = NFS_ATTR_FATTR_OWNER; - else + if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) dprintk("%s: nfs_map_name_to_uid failed!\n", __func__); } else @@ -2654,14 +2635,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER; } dprintk("%s: uid=%d\n", __func__, (int)*uid); - return ret; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) { uint32_t len; __be32 *p; - int ret = 0; *gid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) @@ -2671,9 +2651,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) - ret = NFS_ATTR_FATTR_GROUP; - else + if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) dprintk("%s: nfs_map_group_to_gid failed!\n", __func__); } else @@ -2682,14 +2660,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } dprintk("%s: gid=%d\n", __func__, (int)*gid); - return ret; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) { uint32_t major = 0, minor = 0; __be32 *p; - int ret = 0; *rdev = MKDEV(0,0); if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) @@ -2704,10 +2681,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; - ret = NFS_ATTR_FATTR_RDEV; } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); - return ret; + return 0; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2764,7 +2740,6 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) { __be32 *p; - int ret = 0; *used = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) @@ -2773,11 +2748,10 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint READ_BUF(8); READ64(*used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; - ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); - return ret; + return 0; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -2804,8 +2778,6 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); @@ -2822,8 +2794,6 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = 
NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); @@ -2840,8 +2810,6 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); @@ -3026,116 +2994,63 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status; - umode_t fmode = 0; + int status, fmode = 0; uint64_t fileid; - status = decode_op_hdr(xdr, OP_GETATTR); - if (status < 0) + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; - - status = decode_attr_bitmap(xdr, bitmap); - if (status < 0) + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto xdr_error; - status = decode_attr_length(xdr, &attrlen, &savep); - if (status < 0) + fattr->bitmap[0] = bitmap[0]; + fattr->bitmap[1] = bitmap[1]; + + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto xdr_error; - status = decode_attr_type(xdr, bitmap, &type); - if (status < 0) + if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; - fattr->mode = 0; - if (status != 0) { - fattr->mode |= nfs_type2fmt[type]; - fattr->valid |= status; - } + fattr->type = nfs_type2fmt[type].nfs2type; + fmode = nfs_type2fmt[type].mode; - status = decode_attr_change(xdr, bitmap, &fattr->change_attr); - if (status < 0) + if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_size(xdr, bitmap, &fattr->size); - if (status < 0) + if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); - if (status < 0) + if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); - if (status < 0) + if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, struct nfs4_fs_locations, - fattr)); - if (status < 0) + fattr))) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_mode(xdr, bitmap, &fmode); - if (status < 0) + if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) goto xdr_error; - if (status != 0) { - fattr->mode |= fmode; - fattr->valid |= status; - } - - status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); - if (status < 0) + fattr->mode |= fmode; + if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); - if (status < 0) + if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); - if (status < 0) + if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); - if (status < 0) + if ((status = decode_attr_rdev(xdr, 
bitmap, &fattr->rdev)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); - if (status < 0) + if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_access(xdr, bitmap, &fattr->atime); - if (status < 0) + if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); - if (status < 0) + if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); - if (status < 0) + if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); - if (status < 0) + if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) goto xdr_error; - if (status != 0 && !(fattr->valid & status)) { + if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; - fattr->valid |= status; - } - - status = verify_attr_len(xdr, savep, attrlen); + if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) + fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -4163,7 +4078,9 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server); + if (status == NFS4ERR_DELAY) + status = 0; out: return status; } diff --git a/trunk/fs/nfs/pagelist.c b/trunk/fs/nfs/pagelist.c index e2975939126a..7f079209d70a 100644 --- a/trunk/fs/nfs/pagelist.c +++ b/trunk/fs/nfs/pagelist.c @@ -176,6 +176,17 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } +static int nfs_wait_bit_killable(void *word) +{ + int ret = 0; + + if (fatal_signal_pending(current)) + ret = -ERESTARTSYS; + else + schedule(); + return ret; +} + /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. 
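The nfs_wait_bit_killable() helper added above is a standard wait-bit action function: it sleeps until the watched bit clears, but aborts with -ERESTARTSYS when a fatal signal is pending. As a hedged sketch of how such an action function is wired up in this kernel era (the flags word, bit number, and "example_" names are assumptions; only the wait_on_bit() action signature comes from <linux/wait.h>):

/*
 * Illustrative only: a killable wait-bit action function passed to
 * wait_on_bit().  Assumes <linux/wait.h> and <linux/sched.h>; the
 * "example_" identifiers are hypothetical.
 */
static int example_wait_bit_killable(void *word)
{
	if (fatal_signal_pending(current))
		return -ERESTARTSYS;	/* bail out on a fatal signal */
	schedule();			/* otherwise sleep until woken */
	return 0;
}

static int example_wait_for_bit(unsigned long *flags)
{
	/* Sleeps in TASK_KILLABLE until bit 0 of *flags is cleared,
	 * or returns -ERESTARTSYS if a fatal signal arrives. */
	return wait_on_bit(flags, 0, example_wait_bit_killable,
			   TASK_KILLABLE);
}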
diff --git a/trunk/fs/nfs/proc.c b/trunk/fs/nfs/proc.c index 7be72d90d49d..193465210d7c 100644 --- a/trunk/fs/nfs/proc.c +++ b/trunk/fs/nfs/proc.c @@ -663,5 +663,4 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, - .close_context = nfs_close_context, }; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 0942fcbbad3c..d6686f4786dc 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -1018,7 +1018,6 @@ static int nfs_parse_mount_options(char *raw, case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(p); break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1206,14 +1205,12 @@ static int nfs_parse_mount_options(char *raw, /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(string); break; default: errors++; dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); } - kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1221,6 +1218,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); + kfree(string); switch (token) { case Opt_xprt_udp: diff --git a/trunk/fs/nfs/write.c b/trunk/fs/nfs/write.c index e560a78995a3..9f9845859fc1 100644 --- a/trunk/fs/nfs/write.c +++ b/trunk/fs/nfs/write.c @@ -313,34 +313,19 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) - goto out_err; - err = pgio.pg_error; - if (err < 0) - goto out_err; + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; return 0; -out_err: - return err; } /* @@ -419,6 +404,7 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); + nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -538,12 +524,6 @@ static void nfs_cancel_commit_list(struct list_head *head) } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static int -nfs_need_commit(struct nfs_inode *nfsi) -{ - return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); -} - /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan @@ -558,18 +538,16 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); + int res = 0; - if (!nfs_need_commit(nfsi)) - return 0; - - return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (nfsi->ncommit != 0) { + res = nfs_scan_list(nfsi, dst, idx_start, npages, + NFS_PAGE_TAG_COMMIT); + nfsi->ncommit -= res; + } + return res; } #else 
-static inline int nfs_need_commit(struct nfs_inode *nfsi) -{ - return 0; -} - static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -842,7 +820,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!nfs_need_commit(NFS_I(inode))) + if (!NFS_I(inode)->ncommit) data->args.stable = NFS_FILE_SYNC; } @@ -1447,13 +1425,18 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; + int ret; + ret = __nfs_write_mapping(mapping, &wbc, how); + if (ret < 0) + return ret; + wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/trunk/fs/nfsd/nfsctl.c b/trunk/fs/nfsd/nfsctl.c index a4ed8644d69c..3d93b2064ce5 100644 --- a/trunk/fs/nfsd/nfsctl.c +++ b/trunk/fs/nfsd/nfsctl.c @@ -938,12 +938,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(buf, "%15s %4d", transport, &port) == 2) { - if (port < 1 || port > 65535) - return -EINVAL; err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, PF_INET, port, + transport, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for @@ -962,7 +960,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { - if (port < 1 || port > 65535) + if (port == 0) return -EINVAL; if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, diff --git a/trunk/fs/nfsd/nfssvc.c b/trunk/fs/nfsd/nfssvc.c index bc3567bab8c4..07e4f5d7baa8 100644 --- a/trunk/fs/nfsd/nfssvc.c +++ b/trunk/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -243,7 +244,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -252,7 +253,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git a/trunk/include/drm/drm_crtc_helper.h b/trunk/include/drm/drm_crtc_helper.h index ec073d8288d9..c7d4b2e606a5 100644 --- a/trunk/include/drm/drm_crtc_helper.h +++ b/trunk/include/drm/drm_crtc_helper.h @@ -33,6 +33,7 @@ #ifndef __DRM_CRTC_HELPER_H__ #define __DRM_CRTC_HELPER_H__ +#include #include #include #include @@ -91,7 +92,7 @@ struct drm_connector_helper_funcs { extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY); extern void drm_helper_disable_unused_functions(struct drm_device *dev); extern int drm_helper_hotplug_stage_two(struct drm_device *dev); -extern bool drm_helper_initial_config(struct drm_device *dev); +extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow); extern int 
drm_crtc_helper_set_config(struct drm_mode_set *set); extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_display_mode *mode, diff --git a/trunk/include/drm/drm_os_linux.h b/trunk/include/drm/drm_os_linux.h index 26641e95e0a4..013551d03c03 100644 --- a/trunk/include/drm/drm_os_linux.h +++ b/trunk/include/drm/drm_os_linux.h @@ -7,12 +7,12 @@ #include #ifndef readq -static inline u64 readq(void __iomem *reg) +static u64 readq(void __iomem *reg) { return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32); } -static inline void writeq(u64 val, void __iomem *reg) +static void writeq(u64 val, void __iomem *reg) { writel(val & 0xffffffff, reg); writel(val >> 32, reg + 0x4UL); diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 8815a3456b3b..4d248b3f1323 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -648,12 +648,6 @@ struct transaction_s */ int t_handle_count; - /* - * This transaction is being forced and some process is - * waiting for it to finish. - */ - int t_synchronous_commit:1; - /* * For use by the filesystem to store fs-specific data * structures associated with the transaction diff --git a/trunk/include/linux/nfs_fs.h b/trunk/include/linux/nfs_fs.h index bde2557c2a9c..8cc8807f77d6 100644 --- a/trunk/include/linux/nfs_fs.h +++ b/trunk/include/linux/nfs_fs.h @@ -166,7 +166,8 @@ struct nfs_inode { */ struct radix_tree_root nfs_page_tree; - unsigned long npages; + unsigned long ncommit, + npages; /* Open contexts for shared mmap writes */ struct list_head open_files; @@ -206,7 +207,6 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { diff --git a/trunk/include/linux/nfs_fs_sb.h b/trunk/include/linux/nfs_fs_sb.h index 29b1e40dce99..9bb81aec91cf 100644 --- a/trunk/include/linux/nfs_fs_sb.h +++ b/trunk/include/linux/nfs_fs_sb.h @@ -106,11 +106,6 @@ struct nfs_server { u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this filesystem */ - u32 cache_consistency_bitmask[2]; - /* V4 bitmask representing the subset - of change attribute, size, ctime - and mtime attributes supported by - the server */ u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ diff --git a/trunk/include/linux/nfs_xdr.h b/trunk/include/linux/nfs_xdr.h index b89c34e40bc2..43a713fce11c 100644 --- a/trunk/include/linux/nfs_xdr.h +++ b/trunk/include/linux/nfs_xdr.h @@ -27,8 +27,12 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid } struct nfs_fattr { - unsigned int valid; /* which fields are valid */ - umode_t mode; + unsigned short valid; /* which fields are valid */ + __u64 pre_size; /* pre_op_attr.size */ + struct timespec pre_mtime; /* pre_op_attr.mtime */ + struct timespec pre_ctime; /* pre_op_attr.ctime */ + enum nfs_ftype type; /* always use NFSv2 types */ + __u32 mode; __u32 nlink; __u32 uid; __u32 gid; @@ -48,55 +52,19 @@ struct nfs_fattr { struct timespec atime; struct timespec mtime; struct timespec ctime; + __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ - __u64 pre_size; /* pre_op_attr.size */ - struct timespec pre_mtime; /* pre_op_attr.mtime */ - 
struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; }; -#define NFS_ATTR_FATTR_TYPE (1U << 0) -#define NFS_ATTR_FATTR_MODE (1U << 1) -#define NFS_ATTR_FATTR_NLINK (1U << 2) -#define NFS_ATTR_FATTR_OWNER (1U << 3) -#define NFS_ATTR_FATTR_GROUP (1U << 4) -#define NFS_ATTR_FATTR_RDEV (1U << 5) -#define NFS_ATTR_FATTR_SIZE (1U << 6) -#define NFS_ATTR_FATTR_PRESIZE (1U << 7) -#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8) -#define NFS_ATTR_FATTR_SPACE_USED (1U << 9) -#define NFS_ATTR_FATTR_FSID (1U << 10) -#define NFS_ATTR_FATTR_FILEID (1U << 11) -#define NFS_ATTR_FATTR_ATIME (1U << 12) -#define NFS_ATTR_FATTR_MTIME (1U << 13) -#define NFS_ATTR_FATTR_CTIME (1U << 14) -#define NFS_ATTR_FATTR_PREMTIME (1U << 15) -#define NFS_ATTR_FATTR_PRECTIME (1U << 16) -#define NFS_ATTR_FATTR_CHANGE (1U << 17) -#define NFS_ATTR_FATTR_PRECHANGE (1U << 18) -#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ - -#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ - | NFS_ATTR_FATTR_MODE \ - | NFS_ATTR_FATTR_NLINK \ - | NFS_ATTR_FATTR_OWNER \ - | NFS_ATTR_FATTR_GROUP \ - | NFS_ATTR_FATTR_RDEV \ - | NFS_ATTR_FATTR_SIZE \ - | NFS_ATTR_FATTR_FSID \ - | NFS_ATTR_FATTR_FILEID \ - | NFS_ATTR_FATTR_ATIME \ - | NFS_ATTR_FATTR_MTIME \ - | NFS_ATTR_FATTR_CTIME) -#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_BLOCKS_USED) -#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_SPACE_USED) -#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_SPACE_USED \ - | NFS_ATTR_FATTR_CHANGE) +#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ +#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ +#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ +#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ +#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ +#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ /* * Info on the file system @@ -868,7 +836,6 @@ struct nfs_rpc_ops { int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); - void (*close_context)(struct nfs_open_context *ctx, int); }; /* diff --git a/trunk/include/linux/sunrpc/svc.h b/trunk/include/linux/sunrpc/svc.h index d3a4c0231933..3435d24bfe55 100644 --- a/trunk/include/linux/sunrpc/svc.h +++ b/trunk/include/linux/sunrpc/svc.h @@ -69,6 +69,7 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ + sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -384,19 +385,19 @@ struct svc_procedure { /* * Function prototypes. 
*/ -struct svc_serv *svc_create(struct svc_program *, unsigned int, +struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv *), + sa_family_t, void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const int, - const unsigned short, const unsigned short); +int svc_register(const struct svc_serv *, const unsigned short, + const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --git a/trunk/include/linux/sunrpc/svc_xprt.h b/trunk/include/linux/sunrpc/svc_xprt.h index 0d9cb6ef28b0..0127daca4354 100644 --- a/trunk/include/linux/sunrpc/svc_xprt.h +++ b/trunk/include/linux/sunrpc/svc_xprt.h @@ -71,8 +71,7 @@ int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); -int svc_create_xprt(struct svc_serv *, const char *, const int, - const unsigned short, int); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); @@ -81,8 +80,7 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, - const sa_family_t af, const unsigned short port); +struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) @@ -90,32 +88,29 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - const struct sockaddr *sa, - const size_t salen) + struct sockaddr *sa, int salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - const struct sockaddr *sa, - const size_t salen) + struct sockaddr *sa, int salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(const struct sockaddr *sa) +static inline unsigned short svc_addr_port(struct sockaddr *sa) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; - + unsigned short ret = 0; switch (sa->sa_family) { case AF_INET: - return ntohs(sin->sin_port); + ret = ntohs(((struct sockaddr_in *)sa)->sin_port); + break; case AF_INET6: - return ntohs(sin6->sin6_port); + ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + break; } - - return 0; + return ret; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -129,39 +124,36 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) +static inline unsigned short 
svc_xprt_local_port(struct svc_xprt *xprt) { - return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) { - return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(const struct sockaddr *addr, - char *buf, const size_t len) +static inline char *__svc_print_addr(struct sockaddr *addr, + char *buf, size_t len) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; - switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, - ntohs(sin->sin_port)); + snprintf(buf, len, "%pI4, port=%u", + &((struct sockaddr_in *)addr)->sin_addr, + ntohs(((struct sockaddr_in *) addr)->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &sin6->sin6_addr, - ntohs(sin6->sin6_port)); + &((struct sockaddr_in6 *)addr)->sin6_addr, + ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } - return buf; } #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/trunk/include/linux/sunrpc/xprt.h b/trunk/include/linux/sunrpc/xprt.h index 1758d9f5b5c3..11fc71d50c1e 100644 --- a/trunk/include/linux/sunrpc/xprt.h +++ b/trunk/include/linux/sunrpc/xprt.h @@ -235,7 +235,6 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); @@ -260,7 +259,6 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) -#define XPRT_CONNECTION_ABORT (7) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/trunk/net/sunrpc/Kconfig b/trunk/net/sunrpc/Kconfig index afd91c78ce8e..5592883e1e4a 100644 --- a/trunk/net/sunrpc/Kconfig +++ b/trunk/net/sunrpc/Kconfig @@ -17,6 +17,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. 
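The Kconfig help above is mirrored later in this patch, where net/sunrpc/svc.c compiles one of two registration paths depending on the option. A minimal sketch of that pattern, with variable names assumed (the actual hunks appear further below):

/*
 * Sketch: CONFIG_SUNRPC_REGISTER_V4 selects between the rpcbind v4
 * SET path and the legacy v2 call, matching the svc.c changes below.
 */
#ifdef CONFIG_SUNRPC_REGISTER_V4
	error = rpcb_v4_register(program, version, sap, netid);
#else
	error = rpcb_register(program, version, protocol, port);
#endif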
+ config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/trunk/net/sunrpc/clnt.c b/trunk/net/sunrpc/clnt.c index 5abab094441f..836f15c0c4a3 100644 --- a/trunk/net/sunrpc/clnt.c +++ b/trunk/net/sunrpc/clnt.c @@ -1032,20 +1032,27 @@ call_connect_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - if (status >= 0 || status == -EAGAIN) { + if (status >= 0) { clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; } + /* Something failed: remote service port may have changed */ + rpc_force_rebind(clnt); + switch (status) { + case -ENOTCONN: + case -EAGAIN: + task->tk_action = call_bind; + if (!RPC_IS_SOFT(task)) + return; /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; - break; - default: - rpc_exit(task, -EIO); + return; } + rpc_exit(task, -EIO); } /* @@ -1098,26 +1105,14 @@ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; - switch (task->tk_status) { - case -EAGAIN: - break; - default: - xprt_end_transmit(task); - /* - * Special cases: if we've been waiting on the - * socket's write_space() callback, or if the - * socket just returned a connection error, - * then hold onto the transport lock. - */ - case -ECONNREFUSED: - case -ECONNRESET: - case -ENOTCONN: - case -EHOSTDOWN: - case -EHOSTUNREACH: - case -ENETUNREACH: - case -EPIPE: - rpc_task_force_reencode(task); - } + /* + * Special case: if we've been waiting on the socket's write_space() + * callback, then don't call xprt_end_transmit(). + */ + if (task->tk_status == -EAGAIN) + return; + xprt_end_transmit(task); + rpc_task_force_reencode(task); } /* @@ -1157,12 +1152,9 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(task->tk_xprt, req->rq_connect_cookie); break; - case -ECONNRESET: case -ECONNREFUSED: - rpc_force_rebind(clnt); - rpc_delay(task, 3*HZ); - case -EPIPE: case -ENOTCONN: + rpc_force_rebind(clnt); task->tk_action = call_bind; break; case -EAGAIN: diff --git a/trunk/net/sunrpc/rpcb_clnt.c b/trunk/net/sunrpc/rpcb_clnt.c index beee6da33035..03ae007641e4 100644 --- a/trunk/net/sunrpc/rpcb_clnt.c +++ b/trunk/net/sunrpc/rpcb_clnt.c @@ -63,16 +63,9 @@ enum { * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. - * - * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a - * UID which it maps to a local user name via a password lookup. - * In all other cases it is ignored. - * - * For SET/UNSET requests, user space provides a value, even for - * network requests, and GETADDR uses an empty string. We follow - * those precedents here. + * We always use the following (arbitrary) fixed string. 
*/ -#define RPCB_OWNER_STRING "0" +#define RPCB_OWNER_STRING "rpcb" #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); @@ -131,6 +124,12 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; +static const struct sockaddr_in6 rpcb_in6addr_loopback = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_port = htons(RPCBIND_PORT), +}; + static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, size_t addrlen, u32 version) { @@ -177,10 +176,9 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(const u32 version, struct rpc_message *msg) +static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, + u32 version, struct rpc_message *msg) { - struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; - size_t addrlen = sizeof(rpcb_inaddr_loopback); struct rpc_clnt *rpcb_clnt; int result, error = 0; @@ -194,7 +192,7 @@ static int rpcb_register_call(const u32 version, struct rpc_message *msg) error = PTR_ERR(rpcb_clnt); if (error < 0) { - dprintk("RPC: failed to contact local rpcbind " + printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); return error; } @@ -256,23 +254,25 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) if (port) msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_2, &msg); + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_2, &msg); } /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_inet4(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_netid4(struct sockaddr_in *address_to_register, + struct rpc_message *msg) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(sin->sin_port); + unsigned short port = ntohs(address_to_register->sin_port); char buf[32]; /* Construct AF_INET universal address */ snprintf(buf, sizeof(buf), "%pI4.%u.%u", - &sin->sin_addr.s_addr, port >> 8, port & 0xff); + &address_to_register->sin_addr.s_addr, + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -284,27 +284,29 @@ static int rpcb_register_inet4(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_4, msg); + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_4, msg); } /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_inet6(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, + struct rpc_message *msg) { - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(sin6->sin6_port); + unsigned short port = ntohs(address_to_register->sin6_port); char buf[64]; /* Construct AF_INET6 universal address */ - if (ipv6_addr_any(&sin6->sin6_addr)) + if (ipv6_addr_any(&address_to_register->sin6_addr)) snprintf(buf, sizeof(buf), "::.%u.%u", port >> 8, port & 0xff); else snprintf(buf, sizeof(buf), "%pI6.%u.%u", - &sin6->sin6_addr, port >> 8, port & 0xff); + &address_to_register->sin6_addr, + port >> 8, 
port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -316,21 +318,9 @@ static int rpcb_register_inet6(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_4, msg); -} - -static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) -{ - struct rpcbind_args *map = msg->rpc_argp; - - dprintk("RPC: unregistering [%u, %u, '%s'] with " - "local rpcbind\n", - map->r_prog, map->r_vers, map->r_netid); - - map->r_addr = ""; - msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - - return rpcb_register_call(RPCBVERS_4, msg); + return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, + sizeof(rpcb_in6addr_loopback), + RPCBVERS_4, msg); } /** @@ -350,11 +340,10 @@ static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) * invoke this function once for each [program, version, address, * netid] tuple they wish to advertise. * - * Callers may also unregister RPC services that are registered at a - * specific address by setting the port number in @address to zero. - * They may unregister all registered protocol families at once for - * a service by passing a NULL @address argument. If @netid is "" - * then all netids for [program, version, address] are unregistered. + * Callers may also unregister RPC services that are no longer + * available by setting the port number in the passed-in address + * to zero. Callers pass a netid of "" to unregister all + * transport netids associated with [program, version, address]. * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support @@ -389,14 +378,13 @@ int rpcb_v4_register(const u32 program, const u32 version, .rpc_argp = &map, }; - if (address == NULL) - return rpcb_unregister_all_protofamilies(&msg); - switch (address->sa_family) { case AF_INET: - return rpcb_register_inet4(address, &msg); + return rpcb_register_netid4((struct sockaddr_in *)address, + &msg); case AF_INET6: - return rpcb_register_inet6(address, &msg); + return rpcb_register_netid6((struct sockaddr_in6 *)address, + &msg); } return -EAFNOSUPPORT; @@ -591,7 +579,7 @@ void rpcb_getport_async(struct rpc_task *task) map->r_xprt = xprt_get(xprt); map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); - map->r_owner = ""; + map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ map->r_status = -EIO; child = rpcb_call_async(rpcb_clnt, map, proc); @@ -715,16 +703,11 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, *portp = 0; addr_len = ntohl(*p++); - if (addr_len == 0) { - dprintk("RPC: rpcb_decode_getaddr: " - "service is not registered\n"); - return 0; - } - /* - * Simple sanity check. + * Simple sanity check. The smallest possible universal + * address is an IPv4 address string containing 11 bytes. 
*/ - if (addr_len > RPCBIND_MAXUADDRLEN) + if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN) goto out_err; /* diff --git a/trunk/net/sunrpc/svc.c b/trunk/net/sunrpc/svc.c index 9f2f2412a2f3..bb507e2bb94d 100644 --- a/trunk/net/sunrpc/svc.c +++ b/trunk/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; + serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -426,21 +427,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv), + sa_family_t family, void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, family, shutdown); if (serv != NULL) { serv->sv_function = func; @@ -718,6 +719,8 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_exit_thread); +#ifdef CONFIG_SUNRPC_REGISTER_V4 + /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -732,13 +735,12 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - const struct sockaddr_in sin = { + struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; - const char *netid; - int error; + char *netid; switch (protocol) { case IPPROTO_UDP: @@ -748,23 +750,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP; break; default: - return -ENOPROTOOPT; + return -EPROTONOSUPPORT; } - error = rpcb_v4_register(program, version, - (const struct sockaddr *)&sin, netid); - - /* - * User space didn't support rpcbind v4, so retry this - * registration request with the legacy rpcbind v2 protocol. - */ - if (error == -EPROTONOSUPPORT) - error = rpcb_register(program, version, protocol, port); - - return error; + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin, netid); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. 
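The rpcb_register_netid4()/rpcb_register_netid6() helpers above encode the endpoint as an RFC 1833 universal address, appending the port as two dotted decimal octets. A self-contained sketch of the IPv4 case (the example_ name is hypothetical; %pI4 is the kernel's printk format extension for IPv4 addresses):

/*
 * Sketch: 192.0.2.5 port 2049 encodes as "192.0.2.5.8.1",
 * since 2049 == (8 << 8) | 1.
 */
static void example_uaddr4(char *buf, size_t len,
			   const struct sockaddr_in *sin)
{
	unsigned short port = ntohs(sin->sin_port);

	snprintf(buf, len, "%pI4.%u.%u",
		 &sin->sin_addr.s_addr, port >> 8, port & 0xff);
}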
@@ -779,13 +771,12 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - const struct sockaddr_in6 sin6 = { + struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - const char *netid; - int error; + char *netid; switch (protocol) { case IPPROTO_UDP: @@ -795,22 +786,12 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP6; break; default: - return -ENOPROTOOPT; + return -EPROTONOSUPPORT; } - error = rpcb_v4_register(program, version, - (const struct sockaddr *)&sin6, netid); - - /* - * User space didn't support rpcbind version 4, so we won't - * use a PF_INET6 listener. - */ - if (error == -EPROTONOSUPPORT) - error = -EAFNOSUPPORT; - - return error; + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, netid); } -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ /* * Register a kernel RPC service via rpcbind version 4. @@ -818,43 +799,69 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * Returns zero on success; a negative errno value is returned * if any error occurs. */ -static int __svc_register(const char *progname, - const u32 program, const u32 version, - const int family, +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, const unsigned short protocol, const unsigned short port) { - int error = -EAFNOSUPPORT; + int error; switch (family) { - case PF_INET: - error = __svc_rpcb_register4(program, version, + case AF_INET: + return __svc_rpcb_register4(program, version, protocol, port); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case PF_INET6: + case AF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + if (error < 0) + return error; + + /* + * Work around bug in some versions of Linux rpcbind + * which don't allow registration of both inet and + * inet6 netids. + * + * Error return ignored for now. + */ + __svc_rpcb_register4(program, version, + protocol, port); + return 0; } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); - return error; + return -EAFNOSUPPORT; +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * Register a kernel RPC service via rpcbind version 2. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
+ */ +static int __svc_register(const u32 program, const u32 version, + sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + if (family != AF_INET) + return -EAFNOSUPPORT; + + return rpcb_register(program, version, protocol, port); } +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register - * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in the passed-in protocol family + * Service is registered for any address in serv's address family */ -int svc_register(const struct svc_serv *serv, const int family, - const unsigned short proto, const unsigned short port) +int svc_register(const struct svc_serv *serv, const unsigned short proto, + const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -872,15 +879,15 @@ int svc_register(const struct svc_serv *serv, const int family, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - family, + serv->sv_family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); if (progp->pg_vers[i]->vs_hidden) continue; - error = __svc_register(progp->pg_name, progp->pg_prog, - i, family, proto, port); + error = __svc_register(progp->pg_prog, i, + serv->sv_family, proto, port); if (error < 0) break; } @@ -889,31 +896,38 @@ int svc_register(const struct svc_serv *serv, const int family, return error; } -/* - * If user space is running rpcbind, it should take the v4 UNSET - * and clear everything for this [program, version]. If user space - * is running portmap, it will reject the v4 UNSET, but won't have - * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient - * in this case to clear all existing entries for [program, version]. - */ +#ifdef CONFIG_SUNRPC_REGISTER_V4 + static void __svc_unregister(const u32 program, const u32 version, const char *progname) { + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = 0, + }; int error; - error = rpcb_v4_register(program, version, NULL, ""); + error = rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, ""); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} - /* - * User space didn't support rpcbind v4, so retry this - * request with the legacy rpcbind v2 protocol. 
- */ - if (error == -EPROTONOSUPPORT) - error = rpcb_register(program, version, 0, 0); +#else /* CONFIG_SUNRPC_REGISTER_V4 */ +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_register(program, version, 0, 0); dprintk("svc: %s(%sv%u), error %d\n", __func__, progname, version, error); } +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not diff --git a/trunk/net/sunrpc/svc_xprt.c b/trunk/net/sunrpc/svc_xprt.c index 2819ee093f36..e588df5d6b34 100644 --- a/trunk/net/sunrpc/svc_xprt.c +++ b/trunk/net/sunrpc/svc_xprt.c @@ -161,9 +161,7 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - const int family, - const unsigned short port, - int flags) + unsigned short port, int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -178,12 +176,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; size_t len; - switch (family) { - case PF_INET: + switch (serv->sv_family) { + case AF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case PF_INET6: + case AF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -194,8 +192,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, - const int family, const unsigned short port, +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -212,7 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, family, port, flags); + newxprt = __svc_xpo_create(xcl, serv, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); @@ -1036,13 +1033,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/** - * svc_find_xprt - find an RPC transport instance - * @serv: pointer to svc_serv to search - * @xcl_name: C string containing transport's class name - * @af: Address family of transport's local address - * @port: transport's IP port number - * +/* * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1051,14 +1042,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. 
*/ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, - const sa_family_t af, const unsigned short port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, + int af, int port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (serv == NULL || xcl_name == NULL) + if (!serv || !xcl_name) return found; spin_lock_bh(&serv->sv_lock); @@ -1067,7 +1058,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port != 0 && port != svc_xprt_local_port(xprt)) + if (port && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); diff --git a/trunk/net/sunrpc/svcsock.c b/trunk/net/sunrpc/svcsock.c index 9d504234af4a..5763e6460fea 100644 --- a/trunk/net/sunrpc/svcsock.c +++ b/trunk/net/sunrpc/svcsock.c @@ -1110,6 +1110,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1121,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, + *errp = svc_register(serv, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { @@ -1142,6 +1143,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); + /* + * We start one listener per sv_serv. We want AF_INET + * requests to be automatically shunted to our AF_INET6 + * listener using a mapped IPv4 address. Make sure + * no-one starts an equivalent IPv4 listener, which + * would steal our incoming connections. + */ + val = 0; + if (serv->sv_family == AF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1209,8 +1222,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr_storage addr; struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; - int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1222,35 +1233,14 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, "sockets supported\n"); return ERR_PTR(-EINVAL); } - type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - switch (sin->sa_family) { - case AF_INET6: - family = PF_INET6; - break; - case AF_INET: - family = PF_INET; - break; - default: - return ERR_PTR(-EINVAL); - } - error = sock_create_kern(family, type, protocol, &sock); + error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) return ERR_PTR(error); svc_reclassify_socket(sock); - /* - * If this is an PF_INET6 listener, we want to avoid - * getting requests from IPv4 remotes. Those should - * be shunted to a PF_INET listener via rpcbind. 
- */ - val = 1; - if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - if (type == SOCK_STREAM) sock->sk->sk_reuse = 1; /* allow address reuse */ error = kernel_bind(sock, sin, len); diff --git a/trunk/net/sunrpc/xprt.c b/trunk/net/sunrpc/xprt.c index a0bfe53f1621..62098d101a1f 100644 --- a/trunk/net/sunrpc/xprt.c +++ b/trunk/net/sunrpc/xprt.c @@ -151,37 +151,6 @@ int xprt_unregister_transport(struct xprt_class *transport) } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) -{ - struct xprt_class *t; - char module_name[sizeof t->name + 5]; - int result; - - result = 0; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; - } - } - spin_unlock(&xprt_list_lock); - strcpy(module_name, "xprt"); - strncat(module_name, transport_name, sizeof t->name); - result = request_module(module_name); -out: - return result; -} -EXPORT_SYMBOL_GPL(xprt_load_transport); - /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -611,7 +580,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); @@ -629,7 +598,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } @@ -656,7 +625,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); out: spin_unlock_bh(&xprt->transport_lock); } @@ -726,8 +695,9 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { - case -EAGAIN: - dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); + case -ENOTCONN: + dprintk("RPC: %5u xprt_connect_status: connection broken\n", + task->tk_pid); break; case -ETIMEDOUT: dprintk("RPC: %5u xprt_connect_status: connect attempt timed " @@ -848,8 +818,15 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!xprt->ops->reserve_xprt(task)) + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; + goto out_unlock; + } + + if (!xprt_connected(xprt)) { + err = -ENOTCONN; + goto out_unlock; + } out_unlock: spin_unlock_bh(&xprt->transport_lock); return err; @@ -893,26 +870,32 @@ void xprt_transmit(struct rpc_task *task) req->rq_connect_cookie = xprt->connect_cookie; req->rq_xtime = jiffies; status = xprt->ops->send_request(task); - if (status != 0) { - task->tk_status = status; - return; - } + if (status == 0) { + dprintk("RPC: %5u xmit complete\n", task->tk_pid); + 
spin_lock_bh(&xprt->transport_lock); - dprintk("RPC: %5u xmit complete\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); - xprt->ops->set_retrans_timeout(task); + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; - xprt->stat.sends++; - xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; - xprt->stat.bklog_u += xprt->backlog.qlen; + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, xprt_timer); + spin_unlock_bh(&xprt->transport_lock); + return; + } - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, xprt_timer); - spin_unlock_bh(&xprt->transport_lock); + /* Note: at this point, task->tk_sleeping has not yet been set, + * hence there is no danger of the waking up task being put on + * schedq, and being picked up by a parallel run of rpciod(). + */ + task->tk_status = status; + if (status == -ECONNREFUSED) + rpc_sleep_on(&xprt->sending, task, NULL); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/trunk/net/sunrpc/xprtrdma/rpc_rdma.c b/trunk/net/sunrpc/xprtrdma/rpc_rdma.c index e5e28d1946a4..14106d26bb95 100644 --- a/trunk/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/trunk/net/sunrpc/xprtrdma/rpc_rdma.c @@ -310,19 +310,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, pad, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; - - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { - memmove(destp + copy_len, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; - } - dprintk("RPC: %s: tail destp 0x%p len %d\n", - __func__, destp + copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; - } - r_xprt->rx_stats.pullup_copy_count += copy_len; npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { @@ -345,6 +332,17 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp += curlen; copy_len -= curlen; } + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp != rqst->rq_snd_buf.tail[0].iov_base) { + memcpy(destp, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", + __func__, destp, copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } /* header now contains entire send message */ return pad; } @@ -658,7 +656,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) curlen = rqst->rq_rcv_buf.tail[0].iov_len; if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", __func__, srcp, copy_len, curlen); rqst->rq_rcv_buf.tail[0].iov_len = curlen; diff --git a/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 6c26a675435a..a3334e3b73cc 100644 --- a/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -191,6 +191,7 @@ static int map_xdr(struct 
svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) { + int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; u32 sge_bytes; u32 page_bytes; @@ -234,11 +235,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); - + BUG_ON(sge_no > sge_max); vec->count = sge_no; return 0; } @@ -582,6 +579,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); + BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; diff --git a/trunk/net/sunrpc/xprtsock.c b/trunk/net/sunrpc/xprtsock.c index d40ff50887aa..568330eebbfe 100644 --- a/trunk/net/sunrpc/xprtsock.c +++ b/trunk/net/sunrpc/xprtsock.c @@ -49,9 +49,6 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; -#define XS_TCP_LINGER_TO (15U * HZ) -static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; - /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -119,14 +116,6 @@ static ctl_table xs_tunables_table[] = { .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, - { - .procname = "tcp_fin_timeout", - .data = &xs_tcp_fin_timeout, - .maxlen = sizeof(xs_tcp_fin_timeout), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = sysctl_jiffies - }, { .ctl_name = 0, }, @@ -532,12 +521,11 @@ static void xs_nospace_callback(struct rpc_task *task) * @task: task to put to sleep * */ -static int xs_nospace(struct rpc_task *task) +static void xs_nospace(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - int ret = 0; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", task->tk_pid, req->rq_slen - req->rq_bytes_sent, @@ -549,7 +537,6 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - ret = -EAGAIN; /* * Notify TCP that we're limited by the application * window size @@ -561,11 +548,10 @@ static int xs_nospace(struct rpc_task *task) } } else { clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - ret = -ENOTCONN; + task->tk_status = -ENOTCONN; } spin_unlock_bh(&xprt->transport_lock); - return ret; } /** @@ -608,8 +594,6 @@ static int xs_udp_send_request(struct rpc_task *task) /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } - if (!transport->sock) - goto out; switch (status) { case -ENOTSOCK: @@ -617,19 +601,21 @@ static int xs_udp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. 
*/ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } -out: + return status; } @@ -711,8 +697,6 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -EAGAIN; break; } - if (!transport->sock) - goto out; switch (status) { case -ENOTSOCK: @@ -720,19 +704,23 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: - case -EPIPE: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + xs_tcp_shutdown(xprt); } -out: + return status; } @@ -779,13 +767,23 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } -static void xs_reset_transport(struct sock_xprt *transport) +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + */ +static void xs_close(struct rpc_xprt *xprt) { + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; struct sock *sk = transport->inet; - if (sk == NULL) - return; + if (!sk) + goto clear_close_wait; + + dprintk("RPC: xs_close xprt %p\n", xprt); write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; @@ -799,25 +797,8 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_no_check = 0; sock_release(sock); -} - -/** - * xs_close - close a socket - * @xprt: transport - * - * This is used when all requests are complete; ie, no DRC state remains - * on the server we want to save. 
- */ -static void xs_close(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - - dprintk("RPC: xs_close xprt %p\n", xprt); - - xs_reset_transport(transport); - +clear_close_wait: smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1145,47 +1126,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read_unlock(&sk->sk_callback_lock); } -/* - * Do the equivalent of linger/linger2 handling for dealing with - * broken servers that don't close the socket in a timely - * fashion - */ -static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, - unsigned long timeout) -{ - struct sock_xprt *transport; - - if (xprt_test_and_set_connecting(xprt)) - return; - set_bit(XPRT_CONNECTION_ABORT, &xprt->state); - transport = container_of(xprt, struct sock_xprt, xprt); - queue_delayed_work(rpciod_workqueue, &transport->connect_worker, - timeout); -} - -static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport; - - transport = container_of(xprt, struct sock_xprt, xprt); - - if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || - !cancel_delayed_work(&transport->connect_worker)) - return; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - xprt_clear_connecting(xprt); -} - -static void xs_sock_mark_closed(struct rpc_xprt *xprt) -{ - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1218,7 +1158,7 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); break; @@ -1231,10 +1171,10 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); - xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ + set_bit(XPRT_CLOSING, &xprt->state); xprt_force_disconnect(xprt); case TCP_SYN_SENT: xprt->connect_cookie++; @@ -1247,35 +1187,40 @@ static void xs_tcp_state_change(struct sock *sk) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: - set_bit(XPRT_CLOSING, &xprt->state); - xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - xs_tcp_cancel_linger_timeout(xprt); - xs_sock_mark_closed(xprt); + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); } out: read_unlock(&sk->sk_callback_lock); } /** - * xs_error_report - callback mainly for catching socket errors + * xs_tcp_error_report - callback mainly for catching RST events * @sk: socket */ -static void xs_error_report(struct sock *sk) +static void xs_tcp_error_report(struct sock *sk) { struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); + if 
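With the linger machinery removed, the TCP_CLOSE arm above performs the full mark-closed sequence inline: clear XPRT_CLOSE_WAIT and XPRT_CLOSING between memory barriers, then let xprt_disconnect_done() wake every pending task. The barrier pair matters because woken tasks test those bits without holding the transport lock. A loose userspace model of that shape using C11 atomics and explicit fences — these are stand-ins for clear_bit() and smp_mb__before/after_clear_bit(), not the kernel bitops themselves:

```c
#include <assert.h>
#include <stdatomic.h>

#define XPRT_CLOSE_WAIT 0UL
#define XPRT_CLOSING    1UL

static atomic_ulong xprt_state;

/* Userspace model of the TCP_CLOSE arm: clear both lifecycle bits with
 * full fences on either side, so any task that observes the wakeup
 * also observes both bits clear. Illustrative only. */
static void mark_closed(void)
{
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_clear_bit() */
	atomic_fetch_and(&xprt_state, ~(1UL << XPRT_CLOSE_WAIT));
	atomic_fetch_and(&xprt_state, ~(1UL << XPRT_CLOSING));
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_clear_bit() */
	/* ...xprt_disconnect_done() would wake all pending tasks here. */
}

int main(void)
{
	atomic_fetch_or(&xprt_state,
			(1UL << XPRT_CLOSE_WAIT) | (1UL << XPRT_CLOSING));
	mark_closed();
	assert(atomic_load(&xprt_state) == 0);
	return 0;
}
```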
(sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED) + goto out; if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: %s client %p...\n" "RPC: error %d\n", __func__, xprt, sk->sk_err); - xprt_wake_pending_tasks(xprt, -EAGAIN); + + xprt_force_disconnect(xprt); out: read_unlock(&sk->sk_callback_lock); } @@ -1549,7 +1494,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_error_report = xs_error_report; sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; @@ -1582,10 +1526,9 @@ static void xs_udp_connect_worker4(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_reset_transport(transport); + xs_close(xprt); - err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1602,8 +1545,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); } /** @@ -1624,10 +1567,9 @@ static void xs_udp_connect_worker6(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_reset_transport(transport); + xs_close(xprt); - err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { + if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1644,17 +1586,18 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); } /* * We need to preserve the port number so the reply cache on the server can * find our cached RPC replies when we get around to reconnecting. 
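The renamed error callback is now deliberately narrow: it ignores every socket error except an ECONNRESET raised while the socket was still ESTABLISHED — that is, a genuine RST from the peer — and only then forces a transport disconnect, leaving all other transitions to the state-change callback. A userspace sketch of that predicate (the function name is mine; TCP_ESTABLISHED comes from <netinet/tcp.h>):

```c
#include <assert.h>
#include <errno.h>
#include <netinet/tcp.h>
#include <stdbool.h>

/* Only a connection reset reported against a socket that was still in
 * ESTABLISHED is treated as a peer RST worth tearing the transport
 * down for; everything else takes the ordinary close path. */
static bool is_peer_reset(int sk_err, int sk_state)
{
	return sk_err == ECONNRESET && sk_state == TCP_ESTABLISHED;
}

int main(void)
{
	assert(is_peer_reset(ECONNRESET, TCP_ESTABLISHED));
	assert(!is_peer_reset(ETIMEDOUT, TCP_ESTABLISHED));
	assert(!is_peer_reset(ECONNRESET, TCP_CLOSE_WAIT));
	return 0;
}
```

Filtering here avoids tearing the transport down for transient errors that the close/reconnect machinery already handles on its own.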
*/ -static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) { int result; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sockaddr any; dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); @@ -1666,24 +1609,11 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (!result) - xs_sock_mark_closed(xprt); - else + if (result) dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } -static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) -{ - unsigned int state = transport->inet->sk_state; - - if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) - return; - if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) - return; - xs_abort_connection(xprt, transport); -} - static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1699,7 +1629,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sk->sk_error_report = xs_error_report; + sk->sk_error_report = xs_tcp_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ @@ -1727,42 +1657,37 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } /** - * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint - * @xprt: RPC transport to connect - * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint + * @work: RPC transport to connect * * Invoked by a work queue tasklet. 
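xs_tcp_reuse_connection leans on a classic sockets trick: connect()ing a socket to an address whose family is AF_UNSPEC dissolves the current association while the local port binding survives, so the next connect() leaves from the same source port and the server's duplicate reply cache can still match our retransmitted requests. The same call works from userspace; a small runnable sketch (no server needed, since disconnecting a not-yet-connected TCP socket also succeeds on Linux):

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Dissolve a socket's association but keep its local port binding --
 * the trick kernel_connect(&any, ...) performs in the hunk above. */
static int disconnect_keep_port(int fd)
{
	struct sockaddr any;

	memset(&any, 0, sizeof(any));
	any.sa_family = AF_UNSPEC;
	if (connect(fd, &any, sizeof(any)) < 0) {
		perror("connect(AF_UNSPEC)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) { perror("socket"); return 1; }
	/* In the real path the socket would be connected to the server here. */
	if (disconnect_keep_port(fd) == 0)
		puts("association dissolved; local port retained");
	close(fd);
	return 0;
}
```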
*/ -static void xs_tcp_setup_socket(struct rpc_xprt *xprt, - struct sock_xprt *transport, - struct socket *(*create_sock)(struct rpc_xprt *, - struct sock_xprt *)) +static void xs_tcp_connect_worker4(struct work_struct *work) { + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; - int status = -EIO; + int err, status = -EIO; if (xprt->shutdown) goto out; if (!sock) { - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - sock = create_sock(xprt, transport); - if (IS_ERR(sock)) { - status = PTR_ERR(sock); + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } - } else { - int abort_and_exit; + xs_reclassify_socket4(sock); - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); + if (xs_bind4(transport, sock) < 0) { + sock_release(sock); + goto out; + } + } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt, transport); - - if (abort_and_exit) - goto out_eagain; - } + xs_tcp_reuse_connection(xprt); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1771,104 +1696,83 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - switch (status) { - case -ECONNREFUSED: - case -ECONNRESET: - case -ENETUNREACH: - /* retry with existing socket, after a delay */ - case 0: - case -EINPROGRESS: - case -EALREADY: - xprt_clear_connecting(xprt); - return; + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + } } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); -out_eagain: - status = -EAGAIN; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); -} - -static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); +out_clear: + xprt_clear_connecting(xprt); } /** - * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint + * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint * @work: RPC transport to connect * * Invoked by a work queue tasklet. 
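The rewritten worker makes the connect-status triage explicit: -EINPROGRESS and -EALREADY just mean the nonblocking connect is still in flight (clear the connecting flag and wait), -ECONNREFUSED and -ECONNRESET keep the socket around for a delayed retry, and anything else discards the socket via xs_tcp_shutdown. The same classification applies to an ordinary nonblocking connect() in userspace; in this hypothetical demo the 192.0.2.1 address and the NFS port number are placeholders:

```c
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) { perror("socket"); return 1; }
	fcntl(fd, F_SETFL, O_NONBLOCK);		/* connect() must not block */

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(2049);		/* hypothetical NFS port */
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		switch (errno) {
		case EINPROGRESS:
		case EALREADY:
			puts("connect in flight; wait for writability");
			break;
		case ECONNREFUSED:
		case ECONNRESET:
			puts("peer rejected; retry later on the same socket");
			break;
		default:
			perror("connect");	/* unknown: drop the socket */
		}
	}
	close(fd);
	return 0;
}
```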
*/ -static void xs_tcp_connect_worker4(struct work_struct *work) +static void xs_tcp_connect_worker6(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock = transport->sock; + int err, status = -EIO; - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4); -} + if (xprt->shutdown) + goto out; -static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket6(sock); + if (!sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } + xs_reclassify_socket6(sock); - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); -} + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); -/** - * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_tcp_connect_worker6(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6); + status = xs_tcp_finish_connecting(xprt, sock); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + } + } +out: + xprt_wake_pending_tasks(xprt, status); +out_clear: + xprt_clear_connecting(xprt); } /** @@ -1913,6 +1817,9 @@ static void xs_tcp_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; + /* Initiate graceful shutdown of the socket if not already done */ + if (test_bit(XPRT_CONNECTED, &xprt->state)) + xs_tcp_shutdown(xprt); /* Exit if we need to wait for socket shutdown to complete */ if (test_bit(XPRT_CLOSING, &xprt->state)) return;
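The new check at the top of xs_tcp_connect changes reconnect ordering: if the transport still looks connected, it first drives a graceful shutdown — a send-side close that emits a FIN while keeping the receive side open — then returns while XPRT_CLOSING is set and lets the TCP_CLOSE state change re-arm the connect. A rough userspace shape of that entry path; the flags and function name are illustrative, not the kernel API:

```c
#include <stdbool.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* If the old connection is still up, start a graceful FIN handshake
 * (the userspace analogue of what xs_tcp_shutdown drives) and bail
 * out while the close is in progress rather than racing it. */
static void tcp_reconnect(int fd, bool connected, bool closing)
{
	if (connected && shutdown(fd, SHUT_WR) < 0)
		perror("shutdown");
	if (closing)
		return;		/* wait for the close; state change re-drives us */
	/* ...otherwise queue the connect worker (delayed on reconnect). */
	puts("would queue connect worker");
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	tcp_reconnect(fd, false, false);	/* nothing up yet: connect directly */
	close(fd);
	return 0;
}
```

Closing gracefully instead of aborting gives the server a chance to flush pending replies and keeps the port-preserving reuse path above usable on the next connect.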