diff --git a/[refs] b/[refs] index 5b68eff60de7..d21761059362 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: caab36b593b44c97e3c7707c6a8054b320f8d622 +refs/heads/master: 540aca06b737cc38965b52eeceefba3d24376461 diff --git a/trunk/Makefile b/trunk/Makefile index c40d83aedebe..27fb890a2bff 100644 --- a/trunk/Makefile +++ b/trunk/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 29 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc6 NAME = Erotic Pickled Herring # *DOCUMENTATION* diff --git a/trunk/arch/alpha/mm/init.c b/trunk/arch/alpha/mm/init.c index af71d38c8e41..5d7a16eab312 100644 --- a/trunk/arch/alpha/mm/init.c +++ b/trunk/arch/alpha/mm/init.c @@ -189,21 +189,9 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long nr_pages = 0; - unsigned long vaddr; + unsigned long vaddr = VMALLOC_START; unsigned long i, j; - /* calculate needed size */ - for (i = 0; i < crb->map_entries; ++i) - nr_pages += crb->map[i].count; - - /* register the vm area */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm, PAGE_SIZE); - - vaddr = (unsigned long)console_remap_vm.addr; - /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -225,6 +213,12 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } + + /* Let vmalloc know that we've allocated some space. */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.addr = (void *) VMALLOC_START; + console_remap_vm.size = vaddr - VMALLOC_START; + vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/trunk/arch/arm/kernel/setup.c b/trunk/arch/arm/kernel/setup.c index 68d6494c0389..7049815d66d5 100644 --- a/trunk/arch/arm/kernel/setup.c +++ b/trunk/arch/arm/kernel/setup.c @@ -233,13 +233,12 @@ static void __init cacheid_init(void) unsigned int cachetype = read_cpuid_cachetype(); unsigned int arch = cpu_architecture(); - if (arch >= CPU_ARCH_ARMv6) { - if ((cachetype & (7 << 29)) == 4 << 29) { - /* ARMv7 register format */ - cacheid = CACHEID_VIPT_NONALIASING; - if ((cachetype & (3 << 14)) == 1 << 14) - cacheid |= CACHEID_ASID_TAGGED; - } else if (cachetype & (1 << 23)) + if (arch >= CPU_ARCH_ARMv7) { + cacheid = CACHEID_VIPT_NONALIASING; + if ((cachetype & (3 << 14)) == 1 << 14) + cacheid |= CACHEID_ASID_TAGGED; + } else if (arch >= CPU_ARCH_ARMv6) { + if (cachetype & (1 << 23)) cacheid = CACHEID_VIPT_ALIASING; else cacheid = CACHEID_VIPT_NONALIASING; diff --git a/trunk/arch/arm/mach-at91/pm.c b/trunk/arch/arm/mach-at91/pm.c index 7ac812dc055a..9bb4f043aa22 100644 --- a/trunk/arch/arm/mach-at91/pm.c +++ b/trunk/arch/arm/mach-at91/pm.c @@ -332,6 +332,7 @@ static int at91_pm_enter(suspend_state_t state) at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); error: + sdram_selfrefresh_disable(); target_state = PM_SUSPEND_ON; at91_irq_resume(); at91_gpio_resume(); diff --git a/trunk/arch/arm/mm/abort-ev6.S b/trunk/arch/arm/mm/abort-ev6.S index 94077fbd96b7..8a7f65ba14b7 100644 --- a/trunk/arch/arm/mm/abort-ev6.S +++ b/trunk/arch/arm/mm/abort-ev6.S @@ -23,8 +23,7 @@ ENTRY(v6_early_abort) #ifdef CONFIG_CPU_32v6K clrex #else - sub r1, sp, #4 @ Get unused stack location - strex r0, r1, [r1] @ Clear the exclusive monitor + strex r0, r1, [sp] @ Clear the exclusive monitor #endif mrc p15, 0, r1, c5, c0, 0 @ get FSR mrc p15, 0, r0, c6, c0, 0 @ get FAR diff --git a/trunk/arch/arm/plat-s3c64xx/irq-eint.c 
b/trunk/arch/arm/plat-s3c64xx/irq-eint.c index ebb305ce7689..1f7cc0067f5c 100644 --- a/trunk/arch/arm/plat-s3c64xx/irq-eint.c +++ b/trunk/arch/arm/plat-s3c64xx/irq-eint.c @@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq) u32 mask; mask = __raw_readl(S3C64XX_EINT0MASK); - mask &= ~eint_irq_to_bit(irq); + mask |= eint_irq_to_bit(irq); __raw_writel(mask, S3C64XX_EINT0MASK); } diff --git a/trunk/arch/avr32/Kconfig b/trunk/arch/avr32/Kconfig index 05fe3053dcae..b189680d18b0 100644 --- a/trunk/arch/avr32/Kconfig +++ b/trunk/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM +config HAVE_ARCH_BOOTMEM_NODE def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c index d6b772ba3b8f..fb371f5ce132 100644 --- a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c @@ -142,10 +142,6 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev) { unsigned int val; - /* Do not do the fixup on other platforms! */ - if (!machine_is(gef_sbc610)) - return; - printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ diff --git a/trunk/arch/s390/crypto/aes_s390.c b/trunk/arch/s390/crypto/aes_s390.c index 6118890c946d..c42cd898f68b 100644 --- a/trunk/arch/s390/crypto/aes_s390.c +++ b/trunk/arch/s390/crypto/aes_s390.c @@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void) module_init(aes_s390_init); module_exit(aes_s390_fini); -MODULE_ALIAS("aes-all"); +MODULE_ALIAS("aes"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 31758378bcd2..469f3450bf81 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -138,9 +138,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP @@ -783,11 +780,6 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. -config X86_MCE_THRESHOLD - depends on X86_MCE_AMD || X86_MCE_INTEL - bool - default y - config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE @@ -1133,7 +1125,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM +config HAVE_ARCH_BOOTMEM_NODE def_bool y depends on X86_32 && NUMA diff --git a/trunk/arch/x86/include/asm/apicdef.h b/trunk/arch/x86/include/asm/apicdef.h index bc9514fb3b13..63134e31e8b9 100644 --- a/trunk/arch/x86/include/asm/apicdef.h +++ b/trunk/arch/x86/include/asm/apicdef.h @@ -53,7 +53,6 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 -#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/trunk/arch/x86/include/asm/cacheflush.h b/trunk/arch/x86/include/asm/cacheflush.h index 5b301b7ff5f4..2f8466540fb5 100644 --- a/trunk/arch/x86/include/asm/cacheflush.h +++ b/trunk/arch/x86/include/asm/cacheflush.h @@ -5,43 +5,24 @@ #include /* Caches aren't brain-dead on the intel. 
*/ -static inline void flush_cache_all(void) { } -static inline void flush_cache_mm(struct mm_struct *mm) { } -static inline void flush_cache_dup_mm(struct mm_struct *mm) { } -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) { } -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, unsigned long pfn) { } -static inline void flush_dcache_page(struct page *page) { } -static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } -static inline void flush_icache_range(unsigned long start, - unsigned long end) { } -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) { } -static inline void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, - unsigned long addr, - unsigned long len) { } -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -static inline void flush_cache_vunmap(unsigned long start, - unsigned long end) { } +#define flush_cache_all() do { } while (0) +#define flush_cache_mm(mm) do { } while (0) +#define flush_cache_dup_mm(mm) do { } while (0) +#define flush_cache_range(vma, start, end) do { } while (0) +#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define flush_dcache_page(page) do { } while (0) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) +#define flush_icache_range(start, end) do { } while (0) +#define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) +#define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vunmap(start, end) do { } while (0) -static inline void copy_to_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} - -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} +#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) diff --git a/trunk/arch/x86/include/asm/efi.h b/trunk/arch/x86/include/asm/efi.h index edc90f23e708..ca5ffb2856b6 100644 --- a/trunk/arch/x86/include/asm/efi.h +++ b/trunk/arch/x86/include/asm/efi.h @@ -37,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ +#define MAX_EFI_IO_PAGES 100 + extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/trunk/arch/x86/include/asm/fixmap.h b/trunk/arch/x86/include/asm/fixmap.h index 63a79c77d220..dca8f03da5b2 100644 --- a/trunk/arch/x86/include/asm/fixmap.h +++ b/trunk/arch/x86/include/asm/fixmap.h @@ -24,6 +24,9 @@ #include #else #include +#ifdef CONFIG_EFI +#include +#endif #endif /* @@ -89,6 +92,13 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif +#ifdef CONFIG_X86_64 +#ifdef CONFIG_EFI + FIX_EFI_IO_MAP_LAST_PAGE, + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE + + MAX_EFI_IO_PAGES - 1, +#endif +#endif #ifdef CONFIG_X86_VISWS_APIC FIX_CO_CPU, /* Cobalt 
timer */ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ diff --git a/trunk/arch/x86/include/asm/i387.h b/trunk/arch/x86/include/asm/i387.h index 71c9e5183982..48f0004db8c9 100644 --- a/trunk/arch/x86/include/asm/i387.h +++ b/trunk/arch/x86/include/asm/i387.h @@ -172,13 +172,7 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -#ifdef CONFIG_MATH_EMULATION -extern void finit_task(struct task_struct *tsk); -#else -static inline void finit_task(struct task_struct *tsk) -{ -} -#endif +extern void finit(void); static inline void tolerant_fwait(void) { diff --git a/trunk/arch/x86/include/asm/io.h b/trunk/arch/x86/include/asm/io.h index e5383e3d2f8c..683d0b4c00fc 100644 --- a/trunk/arch/x86/include/asm/io.h +++ b/trunk/arch/x86/include/asm/io.h @@ -172,6 +172,8 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + #ifdef CONFIG_X86_32 # include "io_32.h" @@ -196,6 +198,7 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff diff --git a/trunk/arch/x86/include/asm/mce.h b/trunk/arch/x86/include/asm/mce.h index 563933e06a35..32c6e17b960b 100644 --- a/trunk/arch/x86/include/asm/mce.h +++ b/trunk/arch/x86/include/asm/mce.h @@ -11,8 +11,6 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ @@ -92,29 +90,14 @@ extern int mce_disabled; #include -void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. 
- */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); -void cmci_clear(void); -void cmci_reenable(void); -void cmci_rediscover(int dying); -void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } -static inline void cmci_clear(void) {} -static inline void cmci_reenable(void) {} -static inline void cmci_rediscover(int dying) {} -static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -123,23 +106,11 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -extern int mce_available(struct cpuinfo_x86 *c); - -void mce_log_therm_throt_event(__u64 status); +void mce_log_therm_throt_event(unsigned int cpu, __u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); - -typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); -DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); - -enum mcp_flags { - MCP_TIMESTAMP = (1 << 0), /* log time stamp */ - MCP_UC = (1 << 1), /* log uncorrected errors */ -}; -extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); - extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ @@ -149,8 +120,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif - -extern void (*mce_threshold_vector)(void); +extern void stop_mce(void); +extern void restart_mce(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/trunk/arch/x86/include/asm/mmzone_32.h b/trunk/arch/x86/include/asm/mmzone_32.h index ede6998bd92c..105fb90a0635 100644 --- a/trunk/arch/x86/include/asm/mmzone_32.h +++ b/trunk/arch/x86/include/asm/mmzone_32.h @@ -91,9 +91,46 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES -/* always use node 0 for bootmem on this numa platform */ -#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ - (NODE_DATA(0)->bdata) + +/* + * Following are macros that are specific to this numa platform. 
+ */ +#define reserve_bootmem(addr, size, flags) \ + reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) +#define alloc_bootmem(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +#define alloc_bootmem_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +#define alloc_bootmem_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_low_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ +}) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/trunk/arch/x86/include/asm/msr-index.h b/trunk/arch/x86/include/asm/msr-index.h index 2dbd2314139e..358acc59ae04 100644 --- a/trunk/arch/x86/include/asm/msr-index.h +++ b/trunk/arch/x86/include/asm/msr-index.h @@ -77,11 +77,6 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 -/* These are consecutive and not in the normal 4er MCE bank block */ -#define MSR_IA32_MC0_CTL2 0x00000280 -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/trunk/arch/x86/include/asm/percpu.h b/trunk/arch/x86/include/asm/percpu.h index 8f1d2fbec1d4..aee103b26d01 100644 --- a/trunk/arch/x86/include/asm/percpu.h +++ b/trunk/arch/x86/include/asm/percpu.h @@ -43,14 +43,6 @@ #else /* ...!ASSEMBLY */ #include -#include - -#define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) -#define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index d0812e155f1d..1c097a3a6669 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -288,8 +288,6 @@ static inline int is_new_memtype_allowed(unsigned long flags, return 1; } -pmd_t *populate_extra_pmd(unsigned long vaddr); -pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 diff --git a/trunk/arch/x86/include/asm/xen/page.h b/trunk/arch/x86/include/asm/xen/page.h index 1a918dde46b5..4bd990ee43df 100644 --- a/trunk/arch/x86/include/asm/xen/page.h +++ b/trunk/arch/x86/include/asm/xen/page.h @@ -164,7 +164,6 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); -unsigned long 
arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/trunk/arch/x86/kernel/alternative.c b/trunk/arch/x86/kernel/alternative.c index 4c80f1557433..6907b8e85d52 100644 --- a/trunk/arch/x86/kernel/alternative.c +++ b/trunk/arch/x86/kernel/alternative.c @@ -414,17 +414,9 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); - - /* - * Don't stop machine check exceptions while patching. - * MCEs only happen when something got corrupted and in this - * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. - * Also machine checks tend to be broadcast and if one CPU - * goes into machine check the others follow quickly, so we don't - * expect a machine check to cause undue problems during to code - * patching. - */ +#ifdef CONFIG_X86_MCE + stop_mce(); +#endif apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -464,6 +456,9 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); +#ifdef CONFIG_X86_MCE + restart_mce(); +#endif } /** diff --git a/trunk/arch/x86/kernel/apic/apic.c b/trunk/arch/x86/kernel/apic/apic.c index 30909a258d0f..f9cecdfd05c5 100644 --- a/trunk/arch/x86/kernel/apic/apic.c +++ b/trunk/arch/x86/kernel/apic/apic.c @@ -46,7 +46,6 @@ #include #include #include -#include unsigned int num_processors; @@ -843,14 +842,6 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 6) { - v = apic_read(APIC_LVTCMCI); - if (!(v & APIC_LVT_MASKED)) - apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); - } -#endif - /* * Clean APIC state for other OSs: */ @@ -1250,12 +1241,6 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); - -#ifdef CONFIG_X86_MCE_INTEL - /* Recheck CMCI information after local APIC is up on CPU #0 */ - if (smp_processor_id() == 0) - cmci_recheck(); -#endif } void __cpuinit end_local_APIC_setup(void) diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 22590cf688ae..4b1c319d30c3 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); + data->acpi_data = percpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/trunk/arch/x86/kernel/cpu/mcheck/Makefile b/trunk/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..d7d2323bbb69 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/Makefile +++ b/trunk/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,4 +4,3 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o -obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c index 3552119b091d..dfaebce3633e 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = 
read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a635..fe79985ce0f2 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,8 +3,6 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen */ #include @@ -26,9 +24,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -37,6 +32,7 @@ #include #define MISC_MCELOG_MINOR 227 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -51,7 +47,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static u64 *bank; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -62,19 +58,6 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); - rdtscll(m->tsc); -} - /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -136,11 +119,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + printk(KERN_EMERG "TSC %Lx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk("ADDR %Lx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); + printk("MISC %Lx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -166,10 +149,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -int mce_available(struct cpuinfo_x86 *c) +static int mce_available(struct cpuinfo_x86 *c) { - if (mce_dont_init) - return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -191,77 +172,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - struct mce m; - int i; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. - * - * TBD do the same check for MCI_STATUS_EN here? 
- */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) - continue; - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - - mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - - /* - * Clear state for this bank. - */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! + * The actual machine check handler */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -279,18 +190,17 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if (notify_die(DIE_NMI, "machine check", regs, error_code, + if ((regs + && notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - goto out2; - if (!banks) + || !banks) goto out2; - mce_setup(&m); - + memset(&m, 0, sizeof(struct mce)); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -300,32 +210,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - __clear_bit(i, toclear); - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; m.addr = 0; m.bank = i; + m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) continue; - /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. - */ - if ((m.status & MCI_STATUS_UC) == 0) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK); - - __set_bit(i, toclear); - if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -339,12 +235,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } - } else { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; } if (m.status & MCI_STATUS_MISCV) @@ -353,7 +243,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - mce_log(&m); + if (error_code >= 0) + rdtscll(m.tsc); + if (error_code != -2) + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -362,8 +255,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } + + add_taint(TAINT_MACHINE_CHECK); } + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). 
*/ if (!panicm_found) @@ -410,11 +309,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); + out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -434,13 +332,15 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(__u64 status) +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) { struct mce m; - mce_setup(&m); + memset(&m, 0, sizeof(m)); + m.cpu = cpu; m.bank = MCE_THERMAL_BANK; m.status = status; + rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ @@ -453,18 +353,18 @@ void mce_log_therm_throt_event(__u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(unsigned long); -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); -static void mcheck_timer(unsigned long data) +static void mcheck_check_cpu(void *info) { - struct timer_list *t = &per_cpu(mce_timer, data); - - WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + do_machine_check(NULL, 0); +} + +static void mcheck_timer(struct work_struct *work) +{ + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -477,41 +377,31 @@ static void mcheck_timer(unsigned long data) (int)round_jiffies_relative(check_interval*HZ)); } - t->expires = jiffies + next_interval; - add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + schedule_delayed_work(&mcheck_work, next_interval); } -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. */ int mce_notify_user(void) { - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - wake_up_interruptible(&mce_wait); + static unsigned long last_print; + unsigned long now = jiffies; - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. 
- */ - if (trigger[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); - if (__ratelimit(&ratelimit)) + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; printk(KERN_INFO "Machine check events logged\n"); + } return 1; } @@ -535,78 +425,63 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - idle_notifier_register(&mce_idle_notifier); - return 0; + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static void mce_init(void *dummy) { u64 cap; - unsigned b; + int i; rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; - if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; + banks = cap & 0xff; + if (banks > MCE_EXTENDED_BANK) { + banks = MCE_EXTENDED_BANK; + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); } - - /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); - } - /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; - return 0; -} - -static void mce_init(void *dummy) -{ - u64 cap; - int i; - mce_banks_t all_banks; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC, &all_banks); + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + if (i < NR_SYSFS_BANKS) + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + else + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) + if(c->x86 == 15) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, &bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. 
*/ @@ -629,38 +504,20 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) -{ - struct timer_list *t = &__get_cpu_var(mce_timer); - - /* data race harmless because everyone sets to the same value */ - if (!next_interval) - next_interval = check_interval * HZ; - if (!next_interval) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); - add_timer(t); -} - /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - if (!mce_available(c)) - return; + mce_cpu_quirks(c); - if (mce_cap_init() < 0) { - mce_dont_init = 1; + if (mce_dont_init || + !mce_available(c)) return; - } - mce_cpu_quirks(c); mce_init(NULL); mce_cpu_features(c); - mce_init_timer(); } /* @@ -716,7 +573,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned prev, next; + unsigned next; char __user *buf = ubuf; int i, err; @@ -735,32 +592,25 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); - goto timeout; - } - cpu_relax(); + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + goto timeout; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); -timeout: - ; + cpu_relax(); } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + timeout: + ; + } - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; synchronize_sched(); @@ -830,6 +680,20 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + /* * Old style boot options parsing. Only for compatibility. */ @@ -839,7 +703,8 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. +/* mce=off disables machine check. Note you can re-enable it later + using sysfs. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ @@ -863,29 +728,6 @@ __setup("mce=", mcheck_enable); * Sysfs support */ -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ - int i; - - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); - return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ - return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ - return mce_disable(); -} - /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -896,24 +738,20 @@ static int mce_resume(struct sys_device *dev) return 0; } -static void mce_cpu_restart(void *data) -{ - del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(NULL); - mce_init_timer(); -} - /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + if (next_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - on_each_cpu(mce_cpu_restart, NULL, 1); + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; @@ -940,26 +778,16 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *end; - u64 new = simple_strtoull(buf, &end, 0); - if (end == buf) - return -EINVAL; - bank[attr - bank_attrs] = new; - mce_restart(); - return end-buf; -} +/* + * TBD should generate these dynamically based on number of available banks. + * Have only 6 contol banks in /sysfs until then. + */ +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(bank5ctl,bank[5],mce_restart()) static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -986,6 +814,8 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -1015,22 +845,11 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } - for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - if (err) - goto error2; - } cpu_set(cpu, mce_device_initialized); return 0; -error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } error: - while (--i >= 0) { + while (i--) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -1049,46 +868,15 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } -/* Make sure there are no machine checks on offlined CPUs. 
*/ -static void mce_disable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -1103,21 +891,6 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); - add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - break; - case CPU_POST_DEAD: - /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; } return NOTIFY_OK; } @@ -1126,34 +899,6 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) -{ - int i; - - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - return -ENOMEM; -} - static __init int mce_init_device(void) { int err; @@ -1161,11 +906,6 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - - err = mce_init_banks(); - if (err) - return err; - err = sysdev_class_register(&mce_sysclass); if (err) return err; diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index c5a32f92d07e..9817506dd469 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,8 +79,6 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ -static void amd_threshold_interrupt(void); - /* * CPU Initialization */ @@ -176,8 +174,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); - - mce_threshold_vector = amd_threshold_interrupt; } } } @@ -191,13 +187,19 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. 
*/ -static void amd_threshold_interrupt(void) +asmlinkage void mce_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - mce_setup(&m); + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + memset(&m, 0, sizeof(m)); + rdtscll(m.tsc); + m.cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { @@ -231,8 +233,7 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + do_machine_check(NULL, 0); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); @@ -242,10 +243,13 @@ static void amd_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - return; + goto out; } } } +out: + inc_irq_stat(irq_threshold_count); + irq_exit(); } /* diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aaa7d9730938..aa5e287c98e0 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,8 +1,6 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo - * Copyright (C) 2008, 2009 Intel Corporation - * Author: Andi Kleen */ #include @@ -15,7 +13,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { @@ -28,7 +25,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(smp_processor_id(), msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); @@ -88,209 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } -/* - * Support for Intel Correct Machine Check Interrupts. This allows - * the CPU to raise an interrupt when a corrected machine check happened. - * Normally we pick those up using a regular polling timer. - * Also supports reliable discovery of shared banks. - */ - -static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); - -/* - * cmci_discover_lock protects against parallel discovery attempts - * which could race against each other. - */ -static DEFINE_SPINLOCK(cmci_discover_lock); - -#define CMCI_THRESHOLD 1 - -static int cmci_supported(int *banks) -{ - u64 cap; - - /* - * Vendor check is not strictly needed, but the initial - * initialization is vendor keyed and this - * makes sure none of the backdoors are entered otherwise. - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); - return !!(cap & MCG_CMCI_P); -} - -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); -} - -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - -/* - * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks - * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. 
- */ -static void cmci_discover(int banks, int boot) -{ - unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); - int hdr = 0; - int i; - - spin_lock(&cmci_discover_lock); - for (i = 0; i < banks; i++) { - u64 val; - - if (test_bit(i, owned)) - continue; - - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Already owned by someone else? */ - if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) - print_update("SHD", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - continue; - } - - val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) - print_update("CMCI", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - } else { - WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); - } - } - spin_unlock(&cmci_discover_lock); - if (hdr) - printk(KERN_CONT "\n"); -} - -/* - * Just in case we missed an event during initialization check - * all the CMCI owned banks. - */ -void cmci_recheck(void) -{ - unsigned long flags; - int banks; - - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) - return; - local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - local_irq_restore(flags); -} - -/* - * Disable CMCI on this CPU for all banks it owns when it goes down. - * This allows other CPUs to claim the banks on rediscovery. - */ -void cmci_clear(void) -{ - int i; - int banks; - u64 val; - - if (!cmci_supported(&banks)) - return; - spin_lock(&cmci_discover_lock); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } - spin_unlock(&cmci_discover_lock); -} - -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) -{ - int banks; - int cpu; - cpumask_var_t old; - - if (!cmci_supported(&banks)) - return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - - for_each_online_cpu (cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); -} - -/* - * Reenable CMCI on this CPU in case a CPU down failed. - */ -void cmci_reenable(void) -{ - int banks; - if (cmci_supported(&banks)) - cmci_discover(banks, 0); -} - -static __cpuinit void intel_init_cmci(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); - /* - * For CPU #0 this runs with still disabled APIC, but that's - * ok because only the vector is set up. We still do another - * check for the banks later for CPU #0 just to make sure - * to not miss any events. 
- */ - apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); - cmci_recheck(); -} - void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); - intel_init_cmci(); } diff --git a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c b/trunk/arch/x86/kernel/cpu/mcheck/threshold.c deleted file mode 100644 index 23ee9e730f78..000000000000 --- a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Common corrected MCE threshold handler code: - */ -#include -#include - -#include -#include -#include -#include - -static void default_threshold_interrupt(void) -{ - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); -} - -void (*mce_threshold_vector)(void) = default_threshold_interrupt; - -asmlinkage void mce_threshold_interrupt(void) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); -} diff --git a/trunk/arch/x86/kernel/efi.c b/trunk/arch/x86/kernel/efi.c index 1736acc4d7aa..b205272ad394 100644 --- a/trunk/arch/x86/kernel/efi.c +++ b/trunk/arch/x86/kernel/efi.c @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, addr, npages; void *p, *va; efi.systab = NULL; @@ -481,10 +481,7 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + if (PFN_UP(end) <= max_low_pfn_mapped) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/trunk/arch/x86/kernel/efi_64.c b/trunk/arch/x86/kernel/efi_64.c index 22c3b7828c50..a4ee29127fdf 100644 --- a/trunk/arch/x86/kernel/efi_64.c +++ b/trunk/arch/x86/kernel/efi_64.c @@ -100,11 +100,24 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - unsigned long last_map_pfn; + static unsigned pages_mapped __initdata; + unsigned i, pages; + unsigned long offset; - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) + pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + + if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; - return (void __iomem *)__va(phys_addr); + for (i = 0; i < pages; i++) { + __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, + phys_addr, PAGE_KERNEL); + phys_addr += PAGE_SIZE; + pages_mapped++; + } + + return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ + (pages_mapped - pages)) + offset; } diff --git a/trunk/arch/x86/kernel/i387.c b/trunk/arch/x86/kernel/i387.c index f2f8540a7f3d..b0f61f0dcd0a 100644 --- a/trunk/arch/x86/kernel/i387.c +++ b/trunk/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit_task(tsk); + finit(); set_stopped_child_used_math(tsk); return 0; } diff --git a/trunk/arch/x86/kernel/irq_32.c b/trunk/arch/x86/kernel/irq_32.c index 3b09634a5153..9dc6b2b24275 100644 --- a/trunk/arch/x86/kernel/irq_32.c +++ b/trunk/arch/x86/kernel/irq_32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include @@ -56,13 +55,13 @@ static 
inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(PAGE_SIZE))); +}; -static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); -static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { @@ -82,7 +81,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = __get_cpu_var(hardirq_ctx); + irqctx = hardirq_ctx[smp_processor_id()]; /* * this is where we switch to the IRQ stack. However, if we are @@ -126,34 +125,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (per_cpu(hardirq_ctx, cpu)) + if (hardirq_ctx[cpu]) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - per_cpu(hardirq_ctx, cpu) = irqctx; + hardirq_ctx[cpu] = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - per_cpu(softirq_ctx, cpu) = irqctx; + softirq_ctx[cpu] = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); + cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); } void irq_ctx_exit(int cpu) { - per_cpu(hardirq_ctx, cpu) = NULL; + hardirq_ctx[cpu] = NULL; } asmlinkage void do_softirq(void) @@ -170,7 +169,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = __get_cpu_var(softirq_ctx); + irqctx = softirq_ctx[smp_processor_id()]; irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c index 2aef36d8aca2..1cc18d439bbb 100644 --- a/trunk/arch/x86/kernel/reboot.c +++ b/trunk/arch/x86/kernel/reboot.c @@ -216,14 +216,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, - { /* Handle problems with rebooting on Dell XPS710 */ - .callback = set_bios_reboot, - .ident = "Dell XPS710", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), - }, - }, { } }; diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index b746deb9ebc6..4c54bc0d8ff3 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -770,9 +770,6 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); - if (efi_enabled) - efi_init(); - dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -792,6 +789,8 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + if (efi_enabled) + efi_init(); #ifdef 
CONFIG_X86_32 if (ppro_with_ram_bug()) { diff --git a/trunk/arch/x86/kernel/setup_percpu.c b/trunk/arch/x86/kernel/setup_percpu.c index c29f301d3885..d992e6cff730 100644 --- a/trunk/arch/x86/kernel/setup_percpu.c +++ b/trunk/arch/x86/kernel/setup_percpu.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -42,321 +41,6 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); -/** - * pcpu_need_numa - determine percpu allocation needs to consider NUMA - * - * If NUMA is not configured or there is only one NUMA node available, - * there is no reason to consider NUMA. This function determines - * whether percpu allocation should consider NUMA or not. - * - * RETURNS: - * true if NUMA should be considered; otherwise, false. - */ -static bool __init pcpu_need_numa(void) -{ -#ifdef CONFIG_NEED_MULTIPLE_NODES - pg_data_t *last = NULL; - unsigned int cpu; - - for_each_possible_cpu(cpu) { - int node = early_cpu_to_node(cpu); - - if (node_online(node) && NODE_DATA(node) && - last && last != NODE_DATA(node)) - return true; - - last = NODE_DATA(node); - } -#endif - return false; -} - -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = __alloc_bootmem_nopanic(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), - size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return __alloc_bootmem_nopanic(size, align, goal); -#endif -} - -/* - * Remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); -} - -static ssize_t __init setup_pcpu_remap(size_t static_size) -{ - static struct vm_struct vm; - pg_data_t *last; - size_t ptrs_size; - unsigned int cpu; - ssize_t ret; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. 
- */ - if (!cpu_has_pse || pcpu_need_numa()) - return -EINVAL; - - last = NULL; - for_each_possible_cpu(cpu) { - int node = early_cpu_to_node(cpu); - - if (node_online(node) && NODE_DATA(node) && - last && last != NODE_DATA(node)) - goto proceed; - - last = NODE_DATA(node); - } - return -EINVAL; - -proceed: - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - - /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); - - for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) - goto enomem; - - /* - * Only use pcpur_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); - - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); - } - - /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd; - - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); - goto out_free_ar; - -enomem: - for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; -} -#else -static ssize_t __init setup_pcpu_remap(size_t static_size) -{ - return -EINVAL; -} -#endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using - * bootmem allocator and used as-is without being mapped into vmalloc - * area. This enables the first chunk to piggy back on the linear - * physical PMD mapping and doesn't add any additional pressure to - * TLB. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size - + ((size_t)pageno << PAGE_SHIFT)); -} - -static ssize_t __init setup_pcpu_embed(size_t static_size) -{ - unsigned int cpu; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. 
- */ - if (!cpu_has_pse || pcpu_need_numa()) - return -EINVAL; - - /* allocate and copy */ - pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) - memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, - static_size); - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - pcpue_unit_size, - pcpue_unit_size - static_size, pcpue_ptr, - NULL); -} - -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) -{ - populate_extra_pte(addr); -} - -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) - goto enomem; - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, - pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -377,35 +61,38 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; - unsigned int cpu; - unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + ssize_t size; + char *ptr; + int cpu; + + /* Copy section for each CPU (we discard the original) */ + size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. 
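Aside (illustrative sketch, not part of the patch): the hunk above drops the code that tried the remap, embed and 4k first-chunk allocators in turn, which is a plain first-success-wins chain. The stub functions and sizes below are made up.

#include <stdio.h>
#include <errno.h>
#include <stddef.h>

typedef long (*pcpu_setup_fn)(size_t static_size);

static long setup_remap_stub(size_t size) { return -EINVAL; }   /* e.g. no PSE */
static long setup_embed_stub(size_t size) { return -EINVAL; }   /* e.g. NUMA box */
static long setup_4k_stub(size_t size)    { return (long)size; } /* always works */

int main(void)
{
	pcpu_setup_fn chain[] = { setup_remap_stub, setup_embed_stub, setup_4k_stub };
	size_t static_size = 16384;
	long ret = -EINVAL;
	size_t i;

	/* try each allocator until one succeeds */
	for (i = 0; i < sizeof(chain) / sizeof(chain[0]) && ret < 0; i++)
		ret = chain[i](static_size);

	if (ret < 0)
		printf("cannot allocate static percpu area\n");
	else
		printf("first chunk set up, unit size %ld bytes\n", ret);
	return 0;
}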
- */ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); - /* alrighty, percpu areas up and running */ - delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(cpu); + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(size); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + pr_debug("per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); + } +#endif + + memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); + per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); @@ -438,6 +125,8 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); + + DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ diff --git a/trunk/arch/x86/math-emu/fpu_aux.c b/trunk/arch/x86/math-emu/fpu_aux.c index aa0987088774..491e737ce547 100644 --- a/trunk/arch/x86/math-emu/fpu_aux.c +++ b/trunk/arch/x86/math-emu/fpu_aux.c @@ -30,29 +30,20 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_task(struct task_struct *tsk) +void finit(void) { - struct i387_soft_struct *soft = &tsk->thread.xstate->soft; - struct address *oaddr, *iaddr; - soft->cwd = 0x037f; - soft->swd = 0; - soft->ftop = 0; /* We don't keep top in the status word internally. */ - soft->twd = 0xffff; + control_word = 0x037f; + partial_status = 0; + top = 0; /* We don't keep top in the status word internally. */ + fpu_tag_word = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - oaddr = (struct address *)&soft->foo; - oaddr->offset = 0; - oaddr->selector = 0; - iaddr = (struct address *)&soft->fip; - iaddr->offset = 0; - iaddr->selector = 0; - iaddr->opcode = 0; - soft->no_update = 1; -} - -void finit(void) -{ - finit_task(current); + operand_address.offset = 0; + operand_address.selector = 0; + instruction_address.offset = 0; + instruction_address.selector = 0; + instruction_address.opcode = 0; + no_ip_update = 1; } /* diff --git a/trunk/arch/x86/mm/init.c b/trunk/arch/x86/mm/init.c index ce6a722587d8..f89df52683c5 100644 --- a/trunk/arch/x86/mm/init.c +++ b/trunk/arch/x86/mm/init.c @@ -1,9 +1,33 @@ +#include #include + #include #include +#include #include #include +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. 
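A standalone illustration (not kernel code) of the /dev/mem policy described in the comment above and implemented by devmem_is_allowed() just below; page_is_ram() and iomem_is_exclusive() are stubbed here with an assumed memory layout.

#include <stdio.h>

#define PAGE_SHIFT 12

/* Assume RAM covers pages 0..0x7ffff and a driver claims page 0x90000 exclusively. */
static int page_is_ram(unsigned long pagenr)          { return pagenr < 0x80000; }
static int iomem_is_exclusive(unsigned long physaddr) { return (physaddr >> PAGE_SHIFT) == 0x90000; }

static int demo_devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)                            /* first MB: BIOS code/data, legacy apps */
		return 1;
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) /* resource claimed exclusively */
		return 0;
	if (!page_is_ram(pagenr))                     /* MMIO, ACPI tables, etc. */
		return 1;
	return 0;                                     /* ordinary kernel RAM: refuse */
}

int main(void)
{
	unsigned long pages[] = { 0x50, 0x40000, 0x90000, 0xa0000 };
	for (int i = 0; i < 4; i++)
		printf("page %#lx -> %s\n", pages[i],
		       demo_devmem_is_allowed(pages[i]) ? "allowed" : "denied");
	return 0;
}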
+ */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr = begin; diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c index 47df0e1bbeb9..917c4e60c767 100644 --- a/trunk/arch/x86/mm/init_32.c +++ b/trunk/arch/x86/mm/init_32.c @@ -135,23 +135,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -pmd_t * __init populate_extra_pmd(unsigned long vaddr) -{ - int pgd_idx = pgd_index(vaddr); - int pmd_idx = pmd_index(vaddr); - - return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; -} - -pte_t * __init populate_extra_pte(unsigned long vaddr) -{ - int pte_idx = pte_index(vaddr); - pmd_t *pmd; - - pmd = populate_extra_pmd(vaddr); - return one_page_table_init(pmd) + pte_idx; -} - static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { @@ -371,27 +354,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, } } -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/trunk/arch/x86/mm/init_64.c b/trunk/arch/x86/mm/init_64.c index 07f44d491df1..074435e79824 100644 --- a/trunk/arch/x86/mm/init_64.c +++ b/trunk/arch/x86/mm/init_64.c @@ -168,51 +168,34 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +void +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { - if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - } - return pud_offset(pgd, vaddr); -} + pud_t *pud; + pmd_t *pmd; + pte_t *pte; -static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) -{ + pud = pud_page + pud_index(vaddr); if (pud_none(*pud)) { - pmd_t *pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) + if (pmd != pmd_offset(pud, 0)) { printk(KERN_ERR "PAGETABLE BUG #01! 
%p <-> %p\n", - pmd, pmd_offset(pud, 0)); + pmd, pmd_offset(pud, 0)); + return; + } } - return pmd_offset(pud, vaddr); -} - -static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) -{ + pmd = pmd_offset(pud, vaddr); if (pmd_none(*pmd)) { - pte_t *pte = (pte_t *) spp_getpage(); + pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) + if (pte != pte_offset_kernel(pmd, 0)) { printk(KERN_ERR "PAGETABLE BUG #02!\n"); + return; + } } - return pte_offset_kernel(pmd, vaddr); -} - -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -222,7 +205,8 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void +set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -239,24 +223,6 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -pmd_t * __init populate_extra_pmd(unsigned long vaddr) -{ - pgd_t *pgd; - pud_t *pud; - - pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); - return fill_pmd(pud, vaddr); -} - -pte_t * __init populate_extra_pte(unsigned long vaddr) -{ - pmd_t *pmd; - - pmd = populate_extra_pmd(vaddr); - return fill_pte(pmd, vaddr); -} - /* * Create large page table mappings for a range of physical addresses. */ @@ -910,28 +876,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; diff --git a/trunk/arch/x86/mm/kmmio.c b/trunk/arch/x86/mm/kmmio.c index 9f205030d9aa..93d82038af4b 100644 --- a/trunk/arch/x86/mm/kmmio.c +++ b/trunk/arch/x86/mm/kmmio.c @@ -32,14 +32,11 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ - bool armed; /* * Number of times this page has been registered as a part * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU) and post_kmmio_handler(). - * Protected by kmmio_lock, when linked into kmmio_page_table. + * Used only by writers (RCU). 
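Sketch only (not part of the patch): the count field documented above gates arming and disarming of a traced page, so the first registered probe arms it and the last release disarms it. Here arm/disarm are reduced to prints and all names are made up.

#include <stdio.h>
#include <stdbool.h>

struct fault_page {
	unsigned long page;
	int count;   /* probes currently registered on this page */
	bool armed;  /* present bit cleared in the page tables?  */
};

static void arm(struct fault_page *f)    { f->armed = true;  printf("arm    %#lx\n", f->page); }
static void disarm(struct fault_page *f) { f->armed = false; printf("disarm %#lx\n", f->page); }

static void add_probe(struct fault_page *f)
{
	if (!f->count)
		arm(f);
	f->count++;
}

static void release_probe(struct fault_page *f)
{
	f->count--;
	if (!f->count)
		disarm(f);
}

int main(void)
{
	struct fault_page f = { .page = 0xd0000000UL, .count = 0, .armed = false };

	add_probe(&f);     /* first probe arms the page   */
	add_probe(&f);     /* second probe: already armed */
	release_probe(&f); /* still one user left         */
	release_probe(&f); /* last user gone: disarm      */
	return 0;
}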
*/ int count; }; @@ -108,85 +105,57 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) -{ - pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(v)); -} - -static void set_pte_presence(pte_t *pte, bool present, bool *old) -{ - pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(v)); -} - -static int set_page_presence(unsigned long addr, bool present, bool *old) +static void set_page_present(unsigned long addr, bool present, + unsigned int *pglevel) { + pteval_t pteval; + pmdval_t pmdval; unsigned int level; + pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); if (!pte) { pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return -1; + return; } + if (pglevel) + *pglevel = level; + switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); break; + case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); break; + default: pr_err("kmmio: unexpected page level 0x%x.\n", level); - return -1; + return; } __flush_tlb_one(addr); - return 0; } -/* - * Mark the given page as not present. Access to it will trigger a fault. - * - * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the - * protection is ignored here. RCU read lock is assumed held, so the struct - * will not disappear unexpectedly. Furthermore, the caller must guarantee, - * that double arming the same virtual address (page) cannot occur. - * - * Double disarming on the other hand is allowed, and may occur when a fault - * and mmiotrace shutdown happen simultaneously. - */ -static int arm_kmmio_fault_page(struct kmmio_fault_page *f) +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - int ret; - WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); - if (f->armed) { - pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); - } - ret = set_page_presence(f->page, false, &f->old_presence); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); - f->armed = true; - return ret; + set_page_present(page & PAGE_MASK, false, pglevel); } -/** Restore the given page to saved presence state. */ -static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) +/** Mark the given page as present. */ +static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); - WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); - f->armed = false; + set_page_present(page & PAGE_MASK, true, pglevel); } /* @@ -233,32 +202,28 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); if (addr == ctx->addr) { /* - * A second fault on the same page means some other - * condition needs handling by do_page_fault(), the - * page really not being present is the most common. 
+ * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. */ - pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", - addr, smp_processor_id()); - - if (!faultpage->old_presence) - pr_info("kmmio: unexpected secondary hit for " - "address 0x%08lx on CPU %d.\n", addr, - smp_processor_id()); - } else { - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at - * least prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at least + * prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); - disarm_kmmio_fault_page(faultpage); - } + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -279,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage); + disarm_kmmio_fault_page(ctx->fpage->page, NULL); /* * If another cpu accesses the same page while we are stepping, @@ -310,7 +275,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_warning("kmmio: spurious debug trap on CPU %d.\n", + pr_debug("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -318,11 +283,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - /* Prevent racing against release_kmmio_fault_page(). 
*/ - spin_lock(&kmmio_lock); - if (ctx->fpage->count) - arm_kmmio_fault_page(ctx->fpage); - spin_unlock(&kmmio_lock); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -354,25 +315,21 @@ static int add_kmmio_fault_page(unsigned long page) f = get_kmmio_fault_page(page); if (f) { if (!f->count) - arm_kmmio_fault_page(f); + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } - f = kzalloc(sizeof(*f), GFP_ATOMIC); + f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -1; f->count = 1; f->page = page; - - if (arm_kmmio_fault_page(f)) { - kfree(f); - return -1; - } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + arm_kmmio_fault_page(f->page, NULL); + return 0; } @@ -390,7 +347,7 @@ static void release_kmmio_fault_page(unsigned long page, f->count--; BUG_ON(f->count < 0); if (!f->count) { - disarm_kmmio_fault_page(f); + disarm_kmmio_fault_page(f->page, NULL); f->release_next = *release_list; *release_list = f; } diff --git a/trunk/arch/x86/mm/testmmiotrace.c b/trunk/arch/x86/mm/testmmiotrace.c index 427fd1b56df5..ab50a8d7402c 100644 --- a/trunk/arch/x86/mm/testmmiotrace.c +++ b/trunk/arch/x86/mm/testmmiotrace.c @@ -1,5 +1,5 @@ /* - * Written by Pekka Paalanen, 2008-2009 + * Written by Pekka Paalanen, 2008 */ #include #include @@ -9,74 +9,35 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " - "(or 8 MB if read_far is non-zero)."); - -static unsigned long read_far = 0x400100; -module_param(read_far, ulong, 0); -MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " - "(default: 0x400100)."); - -static unsigned v16(unsigned i) -{ - return i * 12 + 7; -} - -static unsigned v32(unsigned i) -{ - return i * 212371 + 13; -} +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; - pr_info(MODULE_NAME ": write test.\n"); mmiotrace_printk("Write test.\n"); - for (i = 0; i < 256; i++) iowrite8(i, p + i); - for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(v16(i), p + i); - + iowrite16(i * 12 + 7, p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(v32(i), p + i); + iowrite32(i * 212371 + 13, p + i); } static void do_read_test(void __iomem *p) { unsigned int i; - unsigned errs[3] = { 0 }; - pr_info(MODULE_NAME ": read test.\n"); mmiotrace_printk("Read test.\n"); - for (i = 0; i < 256; i++) - if (ioread8(p + i) != i) - ++errs[0]; - + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - if (ioread16(p + i) != v16(i)) - ++errs[1]; - + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - if (ioread32(p + i) != v32(i)) - ++errs[2]; - - mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", - errs[0], errs[1], errs[2]); + ioread32(p + i); } -static void do_read_far_test(void __iomem *p) +static void do_test(void) { - pr_info(MODULE_NAME ": read far test.\n"); - mmiotrace_printk("Read far test.\n"); - - ioread32(p + read_far); -} - -static void do_test(unsigned long size) -{ - void __iomem *p = ioremap_nocache(mmio_address, size); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; @@ -84,15 +45,11 @@ static void do_test(unsigned long size) mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); - if (read_far && read_far < size - 4) - do_read_far_test(p); iounmap(p); } static int __init init(void) { 
- unsigned long size = (read_far) ? (8 << 20) : (16 << 10); - if (mmio_address == 0) { pr_err(MODULE_NAME ": you have to use the module argument " "mmio_address.\n"); @@ -101,11 +58,10 @@ static int __init init(void) return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " - "address space, and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); - do_test(size); - pr_info(MODULE_NAME ": All done.\n"); + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); return 0; } diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c index 82cd39a6cbd3..c52f4034c7fd 100644 --- a/trunk/arch/x86/xen/enlighten.c +++ b/trunk/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); + info.mfn = virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,10 +301,8 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = arbitrary_virt_to_mfn((void *)va); - + frames[f] = virt_to_mfn(va); make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -316,7 +314,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -490,7 +488,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); + xmaddr_t maddr = virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c index cb6afa4ec95c..319bd40a57c2 100644 --- a/trunk/arch/x86/xen/mmu.c +++ b/trunk/arch/x86/xen/mmu.c @@ -276,13 +276,6 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -unsigned long arbitrary_virt_to_mfn(void *vaddr) -{ - xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); - - return PFN_DOWN(maddr.maddr); -} - xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/trunk/arch/x86/xen/smp.c b/trunk/arch/x86/xen/smp.c index 8d470562ffc9..035582ae815d 100644 --- a/trunk/arch/x86/xen/smp.c +++ b/trunk/arch/x86/xen/smp.c @@ -219,7 +219,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; - unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -249,12 +248,9 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); - - gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_frames[0] = virt_to_mfn(gdt); ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; diff --git 
a/trunk/block/blktrace.c b/trunk/block/blktrace.c index 028120a0965a..7cf9d1ff45a0 100644 --- a/trunk/block/blktrace.c +++ b/trunk/block/blktrace.c @@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); if (!bt->msg_data) goto err; diff --git a/trunk/crypto/api.c b/trunk/crypto/api.c index 38a2bc02a98c..efe77df6863f 100644 --- a/trunk/crypto/api.c +++ b/trunk/crypto/api.c @@ -215,19 +215,8 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); type &= mask; - alg = crypto_alg_lookup(name, type, mask); - if (!alg) { - char tmp[CRYPTO_MAX_ALG_NAME]; - - request_module(name); - - if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) && - snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) - request_module(tmp); - - alg = crypto_alg_lookup(name, type, mask); - } - + alg = try_then_request_module(crypto_alg_lookup(name, type, mask), + name); if (alg) return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; diff --git a/trunk/drivers/acpi/processor_perflib.c b/trunk/drivers/acpi/processor_perflib.c index 68fd3d292799..9cc769b587ff 100644 --- a/trunk/drivers/acpi/processor_perflib.c +++ b/trunk/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !per_cpu_ptr(performance, i)) { + if (!performance || !percpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = per_cpu_ptr(performance, i); + pr->performance = percpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/trunk/drivers/crypto/ixp4xx_crypto.c b/trunk/drivers/crypto/ixp4xx_crypto.c index d9e751be8c5f..2d637e0fbc03 100644 --- a/trunk/drivers/crypto/ixp4xx_crypto.c +++ b/trunk/drivers/crypto/ixp4xx_crypto.c @@ -457,12 +457,10 @@ static int init_ixp_crypto(void) if (!ctx_pool) { goto err; } - ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0, - "ixp_crypto:out", NULL); + ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); if (ret) goto err; - ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0, - "ixp_crypto:in", NULL); + ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); if (ret) { qmgr_release_queue(SEND_QID); goto err; diff --git a/trunk/drivers/crypto/padlock-aes.c b/trunk/drivers/crypto/padlock-aes.c index 3f0fdd18255d..856b3cc25583 100644 --- a/trunk/drivers/crypto/padlock-aes.c +++ b/trunk/drivers/crypto/padlock-aes.c @@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("aes-all"); +MODULE_ALIAS("aes"); diff --git a/trunk/drivers/crypto/padlock-sha.c b/trunk/drivers/crypto/padlock-sha.c index a2c8e8514b63..a7fbadebf623 100644 --- a/trunk/drivers/crypto/padlock-sha.c +++ b/trunk/drivers/crypto/padlock-sha.c @@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support."); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("sha1-all"); -MODULE_ALIAS("sha256-all"); +MODULE_ALIAS("sha1"); +MODULE_ALIAS("sha256"); MODULE_ALIAS("sha1-padlock"); MODULE_ALIAS("sha256-padlock"); diff --git a/trunk/drivers/dma/iop-adma.c b/trunk/drivers/dma/iop-adma.c index 647374acba94..ea5440dd10dc 100644 --- a/trunk/drivers/dma/iop-adma.c +++ b/trunk/drivers/dma/iop-adma.c @@ 
-1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma"); static struct platform_driver iop_adma_driver = { .probe = iop_adma_probe, - .remove = __devexit_p(iop_adma_remove), + .remove = iop_adma_remove, .driver = { .owner = THIS_MODULE, .name = "iop-adma", diff --git a/trunk/drivers/dma/mv_xor.c b/trunk/drivers/dma/mv_xor.c index 5d5d5b31867f..d35cbd1ff0b3 100644 --- a/trunk/drivers/dma/mv_xor.c +++ b/trunk/drivers/dma/mv_xor.c @@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp, static struct platform_driver mv_xor_driver = { .probe = mv_xor_probe, - .remove = __devexit_p(mv_xor_remove), + .remove = mv_xor_remove, .driver = { .owner = THIS_MODULE, .name = MV_XOR_NAME, diff --git a/trunk/drivers/gpu/drm/drm_stub.c b/trunk/drivers/gpu/drm/drm_stub.c index 7c8b15b22bf2..096e2a37446d 100644 --- a/trunk/drivers/gpu/drm/drm_stub.c +++ b/trunk/drivers/gpu/drm/drm_stub.c @@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, file_priv->minor->master != file_priv->master) { mutex_lock(&dev->struct_mutex); file_priv->minor->master = drm_master_get(file_priv->master); - mutex_unlock(&dev->struct_mutex); + mutex_lock(&dev->struct_mutex); } return 0; diff --git a/trunk/drivers/i2c/busses/i2c-mv64xxx.c b/trunk/drivers/i2c/busses/i2c-mv64xxx.c index 7f186bbcb99d..eeda276f8f16 100644 --- a/trunk/drivers/i2c/busses/i2c-mv64xxx.c +++ b/trunk/drivers/i2c/busses/i2c-mv64xxx.c @@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd, return 0; } -static void +static void __devexit mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) { if (drv_data->reg_base) { @@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) static struct platform_driver mv64xxx_i2c_driver = { .probe = mv64xxx_i2c_probe, - .remove = __devexit_p(mv64xxx_i2c_remove), + .remove = mv64xxx_i2c_remove, .driver = { .owner = THIS_MODULE, .name = MV64XXX_I2C_CTLR_NAME, diff --git a/trunk/drivers/mtd/nand/orion_nand.c b/trunk/drivers/mtd/nand/orion_nand.c index c2dfd3ea353d..917cf8d3ae95 100644 --- a/trunk/drivers/mtd/nand/orion_nand.c +++ b/trunk/drivers/mtd/nand/orion_nand.c @@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev) static struct platform_driver orion_nand_driver = { .probe = orion_nand_probe, - .remove = __devexit_p(orion_nand_remove), + .remove = orion_nand_remove, .driver = { .name = "orion_nand", .owner = THIS_MODULE, diff --git a/trunk/drivers/net/arm/Makefile b/trunk/drivers/net/arm/Makefile index 811a3ccd14c1..c69c0cdba4a2 100644 --- a/trunk/drivers/net/arm/Makefile +++ b/trunk/drivers/net/arm/Makefile @@ -4,7 +4,7 @@ # obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o -obj-$(CONFIG_ARM_ETHERH) += etherh.o +obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o obj-$(CONFIG_ARM_ETHER3) += ether3.o obj-$(CONFIG_ARM_ETHER1) += ether1.o obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o diff --git a/trunk/drivers/net/arm/etherh.c b/trunk/drivers/net/arm/etherh.c index f52f668c49bf..54b52e5b1821 100644 --- a/trunk/drivers/net/arm/etherh.c +++ b/trunk/drivers/net/arm/etherh.c @@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = { .ndo_open = etherh_open, .ndo_stop = etherh_close, .ndo_set_config = etherh_set_config, - .ndo_start_xmit = __ei_start_xmit, - .ndo_tx_timeout = __ei_tx_timeout, - .ndo_get_stats = __ei_get_stats, - .ndo_set_multicast_list = __ei_set_multicast_list, + .ndo_start_xmit = ei_start_xmit, + .ndo_tx_timeout = ei_tx_timeout, + .ndo_get_stats = ei_get_stats, + .ndo_set_multicast_list = 
ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = __ei_poll, + .ndo_poll_controller = ei_poll, #endif }; diff --git a/trunk/drivers/video/pxafb.c b/trunk/drivers/video/pxafb.c index 2552b9f325ee..48ff701d3a72 100644 --- a/trunk/drivers/video/pxafb.c +++ b/trunk/drivers/video/pxafb.c @@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev) static struct platform_driver pxafb_driver = { .probe = pxafb_probe, - .remove = __devexit_p(pxafb_remove), + .remove = pxafb_remove, .suspend = pxafb_suspend, .resume = pxafb_resume, .driver = { diff --git a/trunk/include/linux/bootmem.h b/trunk/include/linux/bootmem.h index 455d83219fae..95837bfb5256 100644 --- a/trunk/include/linux/bootmem.h +++ b/trunk/include/linux/bootmem.h @@ -65,20 +65,23 @@ extern void free_bootmem(unsigned long addr, unsigned long size); #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) -extern int reserve_bootmem(unsigned long addr, - unsigned long size, - int flags); extern int reserve_bootmem_node(pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size, - int flags); + unsigned long physaddr, + unsigned long size, + int flags); +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); +#endif -extern void *__alloc_bootmem(unsigned long size, +extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_nopanic(unsigned long size, +extern void *__alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -87,35 +90,30 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); - +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_nopanic(x) \ __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) - -#define alloc_bootmem_low(x) \ - __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, 
unsigned long size, int flags); diff --git a/trunk/include/linux/percpu.h b/trunk/include/linux/percpu.h index 545b068bcb70..3577ffd90d45 100644 --- a/trunk/include/linux/percpu.h +++ b/trunk/include/linux/percpu.h @@ -76,98 +76,52 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - -/* minimum unit size, also is the maximum supported allocation size */ -#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) - -/* - * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy - * back on the first chunk if arch is manually allocating and mapping - * it for faster access (as a part of large page mapping for example). - * Note that dynamic percpu allocator covers both static and dynamic - * areas, so these values are bigger than PERCPU_MODULE_RESERVE. - * - * On typical configuration with modules, the following values leave - * about 8k of free space on the first chunk after boot on both x86_32 - * and 64 when module support is enabled. When module support is - * disabled, it's much tighter. - */ -#ifndef PERCPU_DYNAMIC_RESERVE -# if BITS_PER_LONG > 32 -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) -# else -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) -# endif -# else -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) -# else -# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) -# endif -# endif -#endif /* PERCPU_DYNAMIC_RESERVE */ - -extern void *pcpu_base_addr; - -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); - -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); - -/* - * Use this to get to a cpu's version of the per-cpu object - * dynamically allocated. Non-atomic access to the current CPU's - * version should probably be combined with get_cpu()/put_cpu(). - */ -#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) - -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - struct percpu_data { void *ptrs[1]; }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) - -#define per_cpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). + */ +#define percpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - -extern void *__alloc_percpu(size_t size, size_t align); -extern void free_percpu(void *__pdata); +extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); +extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size, size_t align) +static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. 
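As an aside (user-space sketch, not the kernel implementation): the percpu_ptr()/__percpu_disguise() pair restored above hands out the bitwise complement of the real struct percpu_data pointer, so a stray direct dereference of the handle faults, while percpu_ptr() undoes the complement and indexes ptrs[cpu]. The demo_* names below are made up.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NR_CPUS 4

struct percpu_data {
	void *ptrs[NR_CPUS];
};

/* Complementing twice gives back the original pointer. */
#define percpu_disguise(p) ((struct percpu_data *)~(uintptr_t)(p))

static void *demo_alloc_percpu(size_t size)
{
	struct percpu_data *pdata = calloc(1, sizeof(*pdata));
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pdata->ptrs[cpu] = calloc(1, size); /* one zeroed buffer per CPU */
	return percpu_disguise(pdata);              /* hand out the disguised handle */
}

#define demo_percpu_ptr(ptr, cpu) \
	((__typeof__(ptr))(percpu_disguise(ptr)->ptrs[(cpu)]))

int main(void)
{
	long *counter = demo_alloc_percpu(sizeof(long));

	*demo_percpu_ptr(counter, 2) = 42; /* touch CPU 2's copy only */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %ld\n", cpu, *demo_percpu_ptr(counter, cpu));
	return 0;
}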
- */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp); } -static inline void free_percpu(void *p) +static inline void percpu_free(void *__pdata) { - kfree(p); + kfree(__pdata); } #endif /* CONFIG_SMP */ -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ - __alignof__(type)) +#define percpu_alloc_mask(size, gfp, mask) \ + __percpu_alloc_mask((size), (gfp), &(mask)) + +#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) + +/* (legacy) interface for use without CPU hotplug handling */ + +#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ + cpu_possible_map) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +#define free_percpu(ptr) percpu_free((ptr)) +#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) #endif /* __LINUX_PERCPU_H */ diff --git a/trunk/include/linux/rcuclassic.h b/trunk/include/linux/rcuclassic.h index 80044a4f3ab9..f3f697df1d71 100644 --- a/trunk/include/linux/rcuclassic.h +++ b/trunk/include/linux/rcuclassic.h @@ -181,10 +181,4 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) -/* A context switch is a grace period for rcuclassic. */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/trunk/include/linux/rcupdate.h b/trunk/include/linux/rcupdate.h index 528343e6da51..921340a7b71c 100644 --- a/trunk/include/linux/rcupdate.h +++ b/trunk/include/linux/rcupdate.h @@ -52,9 +52,6 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -/* Internal to kernel, but needed by rcupreempt.h. */ -extern int rcu_scheduler_active; - #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -268,7 +265,6 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); -extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/trunk/include/linux/rcupreempt.h b/trunk/include/linux/rcupreempt.h index 74304b4538d8..3e05c09b54a2 100644 --- a/trunk/include/linux/rcupreempt.h +++ b/trunk/include/linux/rcupreempt.h @@ -142,19 +142,4 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ -/* - * A context switch is a grace period for rcupreempt synchronize_rcu() - * only during early boot, before the scheduler has been initialized. - * So, how the heck do we get a context switch? Well, if the caller - * invokes synchronize_rcu(), they are willing to accept a context - * switch, so we simply pretend that one happened. - * - * After boot, there might be a blocked or preempted task in an RCU - * read-side critical section, so we cannot then take the fastpath. - */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1 && !rcu_scheduler_active; -} - #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/trunk/include/linux/rcutree.h b/trunk/include/linux/rcutree.h index a722fb67bb2d..d4368b7975c3 100644 --- a/trunk/include/linux/rcutree.h +++ b/trunk/include/linux/rcutree.h @@ -326,10 +326,4 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ -/* A context switch is a grace period for rcutree. 
*/ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - #endif /* __LINUX_RCUTREE_H */ diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index a7c7698583bb..f0a50b20e8a0 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -2303,13 +2303,9 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); -extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif -extern int task_can_switch_user(struct user_struct *up, - struct task_struct *tsk); - #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/trunk/include/linux/vmalloc.h b/trunk/include/linux/vmalloc.h index a43ebec3a7b9..9c0890c7a06a 100644 --- a/trunk/include/linux/vmalloc.h +++ b/trunk/include/linux/vmalloc.h @@ -95,9 +95,6 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); -extern int map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages); -extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ @@ -113,6 +110,5 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/trunk/init/main.c b/trunk/init/main.c index 6bf83afd654d..6441083f8273 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state __read_mostly; +enum system_states system_state; EXPORT_SYMBOL(system_state); /* @@ -464,7 +464,6 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index 1f0657ae555b..ba22484a987e 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -51,7 +51,6 @@ #include #include #include -#include #if 0 #define DEBUGP printk @@ -367,34 +366,6 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP - -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - void *ptr; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __alloc_percpu(size, align); - if (!ptr) - printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); - return ptr; -} - -static void percpu_modfree(void *freeme) -{ - free_percpu(freeme); -} - -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -509,6 +480,21 @@ static void percpu_modfree(void *freeme) } } +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + static int percpu_modinit(void) { pcpu_num_used = 2; @@ -527,26 +513,7 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); - -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - #else /* ... !CONFIG_SMP */ - static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -568,7 +535,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } - #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/trunk/kernel/rcuclassic.c b/trunk/kernel/rcuclassic.c index 654c640a6b9c..bd5a9003497c 100644 --- a/trunk/kernel/rcuclassic.c +++ b/trunk/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/rcupdate.c b/trunk/kernel/rcupdate.c index cae8a059cf47..d92a76a881aa 100644 --- a/trunk/kernel/rcupdate.c +++ b/trunk/kernel/rcupdate.c @@ -44,7 +44,6 @@ #include #include #include -#include enum rcu_barrier { RCU_BARRIER_STD, @@ -56,7 +55,6 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -82,10 +80,6 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -181,9 +175,3 @@ void __init rcu_init(void) __rcu_init(); } -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} diff --git a/trunk/kernel/rcupreempt.c b/trunk/kernel/rcupreempt.c index 5d59e850fb71..33cfc50781f9 100644 --- a/trunk/kernel/rcupreempt.c +++ b/trunk/kernel/rcupreempt.c @@ -1181,9 +1181,6 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; - if (num_online_cpus() == 1) - return; /* blocking is gp if only one CPU! */ - init_completion(&rcu.completion); /* Will wake me after RCU finished. 
*/ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/trunk/kernel/rcutree.c b/trunk/kernel/rcutree.c index 97ce31579ec0..b2fd602a6f6f 100644 --- a/trunk/kernel/rcutree.c +++ b/trunk/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 0a76d0b6f215..7d97ff7c4478 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -9219,16 +9219,6 @@ static int sched_rt_global_constraints(void) return ret; } - -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9322,7 +9312,8 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -9485,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9504,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9600,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/trunk/kernel/stop_machine.c b/trunk/kernel/stop_machine.c index 74541ca49536..0cd415ee62a2 100644 --- a/trunk/kernel/stop_machine.c +++ b/trunk/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. 
*/ get_cpu(); for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); + sm_work = percpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } diff --git a/trunk/kernel/sys.c b/trunk/kernel/sys.c index 37f458e6882a..f145c415bc16 100644 --- a/trunk/kernel/sys.c +++ b/trunk/kernel/sys.c @@ -559,7 +559,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,11 +571,6 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -636,11 +631,10 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - if (new->uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } + retval = -EAGAIN; + if (new->uid != old->uid && set_user(new) < 0) + goto error; + if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -686,10 +680,9 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; + if (uid != old->uid && set_user(new) < 0) { + retval = -EAGAIN; + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -741,13 +734,11 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } + retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } + if (ruid != old->uid && set_user(new) < 0) + goto error; } if (euid != (uid_t) -1) new->euid = euid; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 6a9b696128c8..3551ac742395 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -362,24 +362,6 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif -#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) -/* - * We need to check if a setuid can take place. This function should be called - * before successfully completing the setuid. - */ -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - - return sched_rt_can_attach(up->tg, tsk); - -} -#else -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - return 1; -} -#endif - /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). 
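
The setuid/setreuid hunks in kernel/sys.c above all follow the same credentials protocol: build a private copy, mutate it, then either commit or abort it. A rough fragment showing that ordering (set_user() is local to kernel/sys.c and the surrounding function is omitted; this is only a sketch of the life cycle, not the real syscall, which also handles the saved-UID cases):

    struct cred *new = prepare_creds();
    if (!new)
        return -ENOMEM;

    new->uid = uid;                  /* mutate the private copy ...              */
    if (set_user(new) < 0) {         /* ... which may fail, e.g. on RLIMIT_NPROC */
        abort_creds(new);            /* throw the copy away; old creds stay live */
        return -EAGAIN;
    }
    return commit_creds(new);        /* atomically install the new credentials   */
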
diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 818569b68f46..72255be57f89 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -30,10 +30,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -obj-$(CONFIG_SMP) += percpu.o -else obj-$(CONFIG_SMP) += allocpercpu.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/trunk/mm/allocpercpu.c b/trunk/mm/allocpercpu.c index 3653c570232b..4297bc41bfd2 100644 --- a/trunk/mm/allocpercpu.c +++ b/trunk/mm/allocpercpu.c @@ -99,51 +99,45 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * alloc_percpu - initial setup of per-cpu data + * percpu_alloc_mask - initial setup of per-cpu data * @size: size of per-cpu object - * @align: alignment + * @gfp: may sleep or not etc. + * @mask: populate per-data for cpu's selected through mask bits * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. + * Populating per-cpu data for all online cpu's would be a typical use case, + * which is simplified by the percpu_alloc() wrapper. + * Per-cpu objects are populated with zeroed buffers. */ -void *__alloc_percpu(size_t size, size_t align) +void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); + void *pdata = kzalloc(sz, gfp); void *__pdata = __percpu_disguise(pdata); - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); - if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) + if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__alloc_percpu); +EXPORT_SYMBOL_GPL(__percpu_alloc_mask); /** - * free_percpu - final cleanup of per-cpu data + * percpu_free - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. 
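
The allocpercpu scheme that remains after this hunk is a simple indirection table: the object handed back is (a disguised pointer to) an array of nr_cpu_ids pointers, each populated slot pointing at its own zeroed buffer. A toy userspace model of populate and free, ignoring the cache-line rounding, the pointer disguise, and allocation-failure handling that the real code has:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4

    int main(void)
    {
        /* model of __percpu_alloc_mask(): a table of NR_CPUS pointers ...      */
        long **pdata = calloc(NR_CPUS, sizeof(*pdata));
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            pdata[cpu] = calloc(1, sizeof(long));    /* ... each a zeroed buffer */

        *pdata[1] = 42;                              /* "percpu_ptr(pdata, 1)" in spirit */
        printf("cpu1=%ld cpu2=%ld\n", *pdata[1], *pdata[2]);

        for (int cpu = 0; cpu < NR_CPUS; cpu++)      /* model of percpu_free() */
            free(pdata[cpu]);
        free(pdata);
        return 0;
    }
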
*/ -void free_percpu(void *__pdata) +void percpu_free(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(free_percpu); +EXPORT_SYMBOL_GPL(percpu_free); diff --git a/trunk/mm/bootmem.c b/trunk/mm/bootmem.c index daf92713f7de..51a0ccf61e0e 100644 --- a/trunk/mm/bootmem.c +++ b/trunk/mm/bootmem.c @@ -382,6 +382,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -402,6 +403,7 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -427,8 +429,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, } static void * __init alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -528,34 +530,17 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, return NULL; } -static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ -#ifdef CONFIG_HAVE_ARCH_BOOTMEM - bootmem_data_t *p_bdata; - - p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); - if (p_bdata) - return alloc_bootmem_core(p_bdata, size, align, goal, limit); -#endif - return NULL; -} - static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; - void *region; restart: - region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); - if (region) - return region; - list_for_each_entry(bdata, &bdata_list, list) { + void *region; + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -633,10 +618,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; - ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); - if (ptr) - return ptr; - ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -693,10 +674,6 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; diff --git a/trunk/mm/percpu.c b/trunk/mm/percpu.c deleted file mode 100644 index 3d0f5456827c..000000000000 --- a/trunk/mm/percpu.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * linux/mm/percpu.c - percpu memory allocator - * - * Copyright (C) 2009 SUSE Linux Products GmbH - * Copyright (C) 2009 Tejun Heo - * - * This file is released under the GPLv2. - * - * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks in vmalloc area. 
Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area - * - * c0 c1 c2 - * ------------------- ------------------- ------------ - * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u - * ------------------- ...... ------------------- .... ------------ - * - * Allocation is done in offset-size areas of single unit space. Ie, - * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. - * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists - * according to free size and tries to allocate from the fullest one. - * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous - * area in the chunk. This helps the allocator not to iterate the - * chunk maps unnecessarily. - * - * Allocation state in each chunk is kept using an array of integers - * on chunk->map. A positive value in the map represents a free - * region and negative allocated. Allocation inside a chunk is done - * by scanning this map sequentially and serving the first matching - * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. - * - * To use this allocator, arch code should do the followings. - * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - * - * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate - * regular address to percpu pointer and back - * - * - use pcpu_setup_first_chunk() during percpu area initialization to - * setup the first chunk containing the kernel static percpu area - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ - -struct pcpu_chunk { - struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ - int map_used; /* # of map entries used */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - bool immutable; /* no [de]population allowed */ - struct page *page[]; /* #cpus * UNIT_PAGES */ -}; - -static int pcpu_unit_pages __read_mostly; -static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; -static int pcpu_nr_slots __read_mostly; -static size_t pcpu_chunk_struct_size __read_mostly; - -/* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr __read_mostly; -EXPORT_SYMBOL_GPL(pcpu_base_addr); - -/* the size of kernel static area */ -static int pcpu_static_size __read_mostly; - -/* - * One mutex to rule them all. 
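
Given the layout described in the header comment of the removed mm/percpu.c, translating (chunk, cpu, offset) into an address is pure arithmetic: a chunk covers num_possible_cpus() units of pcpu_unit_size bytes, so CPU N's copy of an allocation at offset off lives at chunk_base + N * unit_size + off. A tiny userspace check of that math, mirroring pcpu_chunk_addr() with made-up sizes:

    #include <stdio.h>

    #define NR_CPUS   4
    #define UNIT_SIZE (16 * 4096)                    /* stand-in for pcpu_unit_size */

    static unsigned long chunk_addr(unsigned long base, int cpu, unsigned long off)
    {
        return base + (unsigned long)cpu * UNIT_SIZE + off;   /* cf. pcpu_chunk_addr() */
    }

    int main(void)
    {
        unsigned long base = 0xf0000000UL;           /* stand-in for chunk->vm->addr */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d copy of offset 0x200 at %#lx\n",
                   cpu, chunk_addr(base, cpu, 0x200));
        return 0;
    }
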
- * - * The following mutex is grabbed in the outermost public alloc/free - * interface functions and released only when the operation is - * complete. As such, every function in this file other than the - * outermost functions are called under pcpu_mutex. - * - * It can easily be switched to use spinlock such that only the area - * allocation and page population commit are protected with it doing - * actual [de]allocation without holding any lock. However, given - * what this allocator does, I think it's better to let them run - * sequentially. - */ -static DEFINE_MUTEX(pcpu_mutex); - -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ - -static int __pcpu_size_to_slot(int size) -{ - int highbit = fls(size); /* size is in bytes */ - return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); -} - -static int pcpu_size_to_slot(int size) -{ - if (size == pcpu_unit_size) - return pcpu_nr_slots - 1; - return __pcpu_size_to_slot(size); -} - -static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) -{ - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) - return 0; - - return pcpu_size_to_slot(chunk->free_size); -} - -static int pcpu_page_idx(unsigned int cpu, int page_idx) -{ - return cpu * pcpu_unit_pages + page_idx; -} - -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} - -static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); -} - -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) -{ - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; -} - -/** - * pcpu_realloc - versatile realloc - * @p: the current pointer (can be NULL for new allocations) - * @size: the current size in bytes (can be 0 for new allocations) - * @new_size: the wanted new size in bytes (can be 0 for free) - * - * More robust realloc which can be used to allocate, resize or free a - * memory area of arbitrary size. If the needed size goes over - * PAGE_SIZE, kernel VM is used. - * - * RETURNS: - * The new pointer on success, NULL on failure. - */ -static void *pcpu_realloc(void *p, size_t size, size_t new_size) -{ - void *new; - - if (new_size <= PAGE_SIZE) - new = kmalloc(new_size, GFP_KERNEL); - else - new = vmalloc(new_size); - if (new_size && !new) - return NULL; - - memcpy(new, p, min(size, new_size)); - if (new_size > size) - memset(new + size, 0, new_size - size); - - if (size <= PAGE_SIZE) - kfree(p); - else - vfree(p); - - return new; -} - -/** - * pcpu_chunk_relocate - put chunk in the appropriate chunk slot - * @chunk: chunk of interest - * @oslot: the previous slot it was on - * - * This function is called after an allocation or free changed @chunk. - * New slot according to the changed state is determined and @chunk is - * moved to the slot. 
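
The slot helpers above bucket chunks roughly by the power of two of their free space: slot = max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1). A userspace replica using __builtin_clz to stand in for the kernel's fls(), with the same shift constant as the file:

    #include <stdio.h>

    #define PCPU_SLOT_BASE_SHIFT 5                   /* same constant as in the file above */

    static int fls(unsigned int x)                   /* kernel-style fls(): MSB position, 1-based */
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    static int size_to_slot(int size)                /* cf. __pcpu_size_to_slot() */
    {
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;
        return slot > 1 ? slot : 1;
    }

    int main(void)
    {
        int sizes[] = { 4, 31, 32, 512, 4096, 65536 };
        for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
            printf("size %6d -> slot %d\n", sizes[i], size_to_slot(sizes[i]));
        return 0;
    }
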
- */ -static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) -{ - int nslot = pcpu_chunk_slot(chunk); - - if (oslot != nslot) { - if (oslot < nslot) - list_move(&chunk->list, &pcpu_slot[nslot]); - else - list_move_tail(&chunk->list, &pcpu_slot[nslot]); - } -} - -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - -/** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * RETURNS: - * The address of the found chunk. - */ -static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) -{ - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; - - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); -} - -/** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) -{ - int nr_extra = !!head + !!tail; - int target = chunk->map_used + nr_extra; - - /* reallocation required? 
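
pcpu_chunk_addr_search(), shown above, is a floor lookup: find the chunk with the highest start address that is not beyond the given pointer. The kernel walks an rb-tree; the same semantics over a sorted array in a small userspace sketch:

    #include <stdio.h>

    /* floor search: index of the highest base <= addr, bases sorted ascending */
    static int addr_to_chunk(const unsigned long *bases, int n, unsigned long addr)
    {
        int lo = 0, hi = n - 1, best = -1;

        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;
            if (bases[mid] <= addr) {
                best = mid;                          /* candidate, look further right */
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return best;                                 /* -1: addr lies before every chunk */
    }

    int main(void)
    {
        unsigned long bases[] = { 0x1000, 0x9000, 0x20000 };   /* chunk->vm->addr values */
        printf("0x9abc -> chunk %d\n", addr_to_chunk(bases, 3, 0x9abc));   /* 1 */
        printf("0x0fff -> chunk %d\n", addr_to_chunk(bases, 3, 0x0fff));   /* -1 */
        return 0;
    }
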
*/ - if (chunk->map_alloc < target) { - int new_alloc = chunk->map_alloc; - int *new; - - while (new_alloc < target) - new_alloc *= 2; - - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); - if (!new) - return -ENOMEM; - - chunk->map_alloc = new_alloc; - chunk->map = new; - } - - /* insert a new subblock */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } - return 0; -} - -/** - * pcpu_alloc_area - allocate area from a pcpu_chunk - * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. - * - * RETURNS: - * Allocated offset in @chunk on success, -errno on failure. - */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) -{ - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - - /* - * The static chunk initially doesn't have map attached - * because kmalloc wasn't available during init. Give it one. - */ - if (unlikely(!chunk->map)) { - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - if (!chunk->map) - return -ENOMEM; - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = -pcpu_static_size; - if (chunk->free_size) - chunk->map[chunk->map_used++] = chunk->free_size; - } - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; - int head, tail; - - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); - continue; - } - - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. - */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; - chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; - head = 0; - } - - /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) - tail = 0; - - /* split if warranted */ - if (head || tail) { - if (pcpu_split_block(chunk, i, head, tail)) - return -ENOMEM; - if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); - } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); - } - - /* update hint and mark allocated */ - if (is_last) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); - - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; - - pcpu_chunk_relocate(chunk, oslot); - return off; - } - - chunk->contig_hint = max_contig; /* fully scanned */ - pcpu_chunk_relocate(chunk, oslot); - - /* - * Tell the upper layer that this chunk has no area left. - * Note that this is not an error condition but a notification - * to upper layer that it needs to look at other chunks. 
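
The allocation map that pcpu_alloc_area() scans encodes runs as signed lengths: a positive entry is a free run, a negative one is allocated, and allocation is first-fit with an optional split. A stripped-down userspace model of that scan (no alignment handling, no map realloc), just to make the encoding concrete:

    #include <stdio.h>

    #define MAP_MAX 16

    static int map[MAP_MAX] = { 4096 };   /* one free run covering the whole unit */
    static int map_used = 1;

    /* first-fit over signed runs: >0 free, <0 allocated; returns offset or -1 */
    static int alloc_area(int size)
    {
        int off = 0;

        for (int i = 0; i < map_used; i++) {
            int len = map[i];

            if (len < size) {                        /* allocated (<0) or too small: skip */
                off += len < 0 ? -len : len;
                continue;
            }
            if (len > size) {                        /* split off the unused tail */
                if (map_used == MAP_MAX)
                    return -1;                       /* toy model: no map realloc */
                for (int j = map_used; j > i; j--)
                    map[j] = map[j - 1];
                map[i + 1] = len - size;
                map_used++;
            }
            map[i] = -size;                          /* mark this run allocated */
            return off;
        }
        return -1;                                   /* caller should try another chunk */
    }

    int main(void)
    {
        printf("a=%d b=%d c=%d\n", alloc_area(512), alloc_area(1024), alloc_area(512));
        /* expected: a=0 b=512 c=1536; map is now -512, -1024, -512, 2048 */
        return 0;
    }
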
- * -ENOSPC is chosen as it isn't used in memory subsystem and - * matches the meaning in a way. - */ - return -ENOSPC; -} - -/** - * pcpu_free_area - free area to a pcpu_chunk - * @chunk: chunk of interest - * @freeme: offset of area to free - * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. - */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) -{ - int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; - BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; - - /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); - i--; - } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); - } - - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); - pcpu_chunk_relocate(chunk, oslot); -} - -/** - * pcpu_unmap - unmap pages out of a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not - * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. - */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); -} - -/** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. 
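
Freeing is the mirror image: flip the run back to positive and coalesce with free neighbours, which is what pcpu_free_area() does before moving the chunk to its new slot. A companion to the toy alloc_area() sketch above (assumes @freeme was returned by that allocator; not kernel code):

    /* free the run starting at @freeme and merge with adjacent free runs */
    static void free_area(int *map, int *map_used, int freeme)
    {
        int off = 0, i;

        for (i = 0; i < *map_used; i++) {            /* locate the run at offset @freeme */
            if (off == freeme)
                break;
            off += map[i] < 0 ? -map[i] : map[i];
        }
        map[i] = -map[i];                            /* allocated -> free */

        if (i + 1 < *map_used && map[i + 1] > 0) {   /* merge with the next free run */
            map[i] += map[i + 1];
            for (int j = i + 1; j < *map_used - 1; j++)
                map[j] = map[j + 1];
            (*map_used)--;
        }
        if (i > 0 && map[i - 1] > 0) {               /* merge with the previous free run */
            map[i - 1] += map[i];
            for (int j = i; j < *map_used - 1; j++)
                map[j] = map[j + 1];
            (*map_used)--;
        }
    }
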
- */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) -{ - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - if (!*pagep) - continue; - - __free_page(*pagep); - - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; - - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; - } - } - - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); -} - -/** - * pcpu_map - map pages into a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to map - * @page_end: page index of the last page to map + 1 - * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. - */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); - if (err < 0) - return err; - } - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - return 0; -} - -/** - * pcpu_populate_chunk - populate and map an area of a pcpu_chunk - * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes - * - * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. - */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) -{ - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } - - map_start = map_start < 0 ? 
i : map_start; - map_end = i + 1; - - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - } - } - - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; - - for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); - - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; -} - -static void free_pcpu_chunk(struct pcpu_chunk *chunk) -{ - if (!chunk) - return; - if (chunk->vm) - free_vm_area(chunk->vm); - pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); - kfree(chunk); -} - -static struct pcpu_chunk *alloc_pcpu_chunk(void) -{ - struct pcpu_chunk *chunk; - - chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); - if (!chunk) - return NULL; - - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; - - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); - if (!chunk->vm) { - free_pcpu_chunk(chunk); - return NULL; - } - - INIT_LIST_HEAD(&chunk->list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; - - return chunk; -} - -/** - * __alloc_percpu - allocate percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - void *ptr = NULL; - struct pcpu_chunk *chunk; - int slot, off; - - if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); - return NULL; - } - - mutex_lock(&pcpu_mutex); - - /* allocate area */ - for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { - list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) - continue; - off = pcpu_alloc_area(chunk, size, align); - if (off >= 0) - goto area_found; - if (off != -ENOSPC) - goto out_unlock; - } - } - - /* hmmm... no space left, create a new chunk */ - chunk = alloc_pcpu_chunk(); - if (!chunk) - goto out_unlock; - pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); - - off = pcpu_alloc_area(chunk, size, align); - if (off < 0) - goto out_unlock; - -area_found: - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - pcpu_free_area(chunk, off); - goto out_unlock; - } - - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); -out_unlock: - mutex_unlock(&pcpu_mutex); - return ptr; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -static void pcpu_kill_chunk(struct pcpu_chunk *chunk) -{ - WARN_ON(chunk->immutable); - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - list_del(&chunk->list); - rb_erase(&chunk->rb_node, &pcpu_addr_root); - free_pcpu_chunk(chunk); -} - -/** - * free_percpu - free percpu area - * @ptr: pointer to area to free - * - * Free percpu area @ptr. Might sleep. 
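
From a caller's point of view, the removed allocator keeps the familiar interface: allocate once, address each CPU's copy through per_cpu_ptr(), free once. A hedged kernel-style usage sketch (the structure and function names are hypothetical, shown only to illustrate the calling pattern):

    struct hit_counter {                         /* hypothetical example structure */
        unsigned long hits;
    };
    static struct hit_counter *counters;

    static int counters_init(void)
    {
        counters = __alloc_percpu(sizeof(*counters), __alignof__(*counters));
        return counters ? 0 : -ENOMEM;
    }

    static void counters_hit(void)
    {
        int cpu = get_cpu();                     /* stay on this CPU while touching our slot */
        per_cpu_ptr(counters, cpu)->hits++;
        put_cpu();
    }

    static unsigned long counters_total(void)
    {
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)               /* fold every CPU's copy */
            sum += per_cpu_ptr(counters, cpu)->hits;
        return sum;
    }
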
- */ -void free_percpu(void *ptr) -{ - void *addr = __pcpu_ptr_to_addr(ptr); - struct pcpu_chunk *chunk; - int off; - - if (!ptr) - return; - - mutex_lock(&pcpu_mutex); - - chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; - - pcpu_free_area(chunk, off); - - /* the chunk became fully free, kill one if there are other free ones */ - if (chunk->free_size == pcpu_unit_size) { - struct pcpu_chunk *pos; - - list_for_each_entry(pos, - &pcpu_slot[pcpu_chunk_slot(chunk)], list) - if (pos != chunk) { - pcpu_kill_chunk(pos); - break; - } - } - - mutex_unlock(&pcpu_mutex); -} -EXPORT_SYMBOL_GPL(free_percpu); - -/** - * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary - * - * Initialize the first percpu chunk which contains the kernel static - * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. - * - * @free_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. - * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. - * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. - * - * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. 
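
The long comment above is effectively the arch contract for pcpu_setup_first_chunk(). A rough sketch of how an architecture's setup path might satisfy it, assuming the static area has already been copied into per-CPU pages; STATIC_PAGES, pcpu_pages, my_get_page and my_populate_pte are all hypothetical names for this sketch:

    static struct page *pcpu_pages[NR_CPUS][STATIC_PAGES];  /* filled in by the arch */

    static struct page *my_get_page(unsigned int cpu, int pageno)
    {
        if (pageno >= STATIC_PAGES)
            return NULL;                         /* end of pages for this cpu */
        return pcpu_pages[cpu][pageno];
    }

    void __init setup_per_cpu_areas(void)
    {
        size_t unit;

        /* auto unit size, no reserved tail, let percpu map the pages itself */
        unit = pcpu_setup_first_chunk(my_get_page,
                                      __per_cpu_end - __per_cpu_start,
                                      0, 0, NULL, my_populate_pte);
        /* ... then derive each CPU's per_cpu_offset() from pcpu_base_addr and unit ... */
    }
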
- */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) -{ - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; - unsigned int cpu; - int nr_pages; - int err, i; - - /* santiy checks */ - BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); - - pcpu_static_size = static_size; - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); - - /* - * Allocate chunk slots. The additional last slot is for - * empty chunks. - */ - pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); - for (i = 0; i < pcpu_nr_slots; i++) - INIT_LIST_HEAD(&pcpu_slot[i]); - - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; - - if (free_size) - static_chunk->free_size = free_size; - else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; - - static_chunk->contig_hint = static_chunk->free_size; - - /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked - * immutable. - */ - static_vm.addr = base_addr; - static_chunk->immutable = true; - } - - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(pcpu_static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, - cpu, i)); - - err = pcpu_map(static_chunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); - - /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); - return pcpu_unit_size; -} diff --git a/trunk/mm/vmalloc.c b/trunk/mm/vmalloc.c index af58324c361a..11a929872ebd 100644 --- a/trunk/mm/vmalloc.c +++ b/trunk/mm/vmalloc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -153,8 +152,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,22 +169,13 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); + flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) -{ - int ret; - - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); - return ret; -} - static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1000,32 +990,6 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); -/** - * vm_area_register_early - register vmap area early during boot - * @vm: vm_struct to register - * @align: requested alignment - * - * This function is used to register kernel vm area before - * vmalloc_init() is called. @vm->size and @vm->flags should contain - * proper values on entry and other fields should be zero. On return, - * vm->addr contains the allocated address. - * - * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. - */ -void __init vm_area_register_early(struct vm_struct *vm, size_t align) -{ - static size_t vm_init_off __initdata; - unsigned long addr; - - addr = ALIGN(VMALLOC_START + vm_init_off, align); - vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - - vm->addr = (void *)addr; - - vm->next = vmlist; - vmlist = vm; -} - void __init vmalloc_init(void) { struct vmap_area *va; @@ -1053,58 +1017,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. 
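
The *_noflush variants removed above push cache and TLB maintenance out to the caller so a whole range can be flushed once instead of per CPU. The calling discipline, written out as a fragment that mirrors what pcpu_map()/pcpu_unmap() in the removed mm/percpu.c actually do:

    /* unmapping with the _noflush API: caller brackets it with the flushes */
    flush_cache_vunmap(start, end);              /* before tearing down the PTEs */
    unmap_kernel_range_noflush(start, end - start);
    flush_tlb_kernel_range(start, end);          /* after */

    /* mapping is the mirror image: populate the PTEs, then flush the vcache once */
    if (map_kernel_range_noflush(start, end - start, PAGE_KERNEL, pages) < 0)
        goto err;                                /* partial mappings handled by the caller */
    flush_cache_vmap(start, end);
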
- */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} - -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; diff --git a/trunk/net/ipv4/af_inet.c b/trunk/net/ipv4/af_inet.c index 3a3dad801354..743f5542d65a 100644 --- a/trunk/net/ipv4/af_inet.c +++ b/trunk/net/ipv4/af_inet.c @@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[0] = __alloc_percpu(mibsize); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[1] = __alloc_percpu(mibsize); if (!ptr[1]) goto err1; return 0; diff --git a/trunk/net/ipv4/route.c b/trunk/net/ipv4/route.c index bf895401218f..97f71153584f 100644 --- a/trunk/net/ipv4/route.c +++ b/trunk/net/ipv4/route.c @@ -3376,7 +3376,7 @@ int __init ip_rt_init(void) int rc = 0; #ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif
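
The extra argument in the snmp_mib_init() and ip_rt_init() hunks above is the allocation's alignment: with the dynamic allocator, callers state it explicitly so the 64-bit MIB counters stay naturally aligned even on 32-bit hosts. The convenience wrapper presumably reduces to something like the following (a sketch; see include/linux/percpu.h in the full tree for the real macro):

    #define alloc_percpu(type) \
        ((type *)__alloc_percpu(sizeof(type), __alignof__(type)))

    /* what snmp_mib_init() asks for: mibsize bytes, aligned for u64 counters */
    ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
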