From 4d4e4db03669c0331637e2fe96d31d2937ffe4b8 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 17 Sep 2005 14:41:40 +1000
Subject: [PATCH] --- yaml --- r: 11400 b: refs/heads/master c:
 d32311fed70d12f14e585feb4653571b1e2b0e6d h: refs/heads/master v: v3

---
 [refs]                                      |   2 +-
 trunk/Documentation/cachetlb.txt            |   9 +
 trunk/Documentation/kernel-parameters.txt   |   2 +
 trunk/Documentation/m68k/kernel-options.txt |  24 +-
 trunk/arch/alpha/mm/numa.c                  |   3 -
 trunk/arch/alpha/mm/remap.c                 |   6 +-
 trunk/arch/arm/kernel/signal.c              |  96 +-
 trunk/arch/arm/kernel/traps.c               |  14 +-
 trunk/arch/arm/mm/consistent.c              |   6 +-
 trunk/arch/arm/mm/fault-armv.c              |   7 +-
 trunk/arch/arm/mm/ioremap.c                 |   4 +-
 trunk/arch/arm/mm/mm-armv.c                 |  15 +-
 trunk/arch/arm/oprofile/backtrace.c         |  46 +-
 trunk/arch/arm26/mm/memc.c                  |  18 +-
 trunk/arch/cris/arch-v32/mm/tlb.c           |   6 +-
 trunk/arch/cris/mm/ioremap.c                |   4 +-
 trunk/arch/frv/mm/dma-alloc.c               |   5 +-
 trunk/arch/frv/mm/pgalloc.c                 |   4 +-
 trunk/arch/i386/kernel/vm86.c               |  17 +-
 trunk/arch/i386/mm/discontig.c              |   4 +-
 trunk/arch/i386/mm/init.c                   |  62 +-
 trunk/arch/i386/mm/ioremap.c                |   4 +-
 trunk/arch/i386/mm/pgtable.c                |  11 +-
 trunk/arch/i386/oprofile/backtrace.c        |  38 +-
 trunk/arch/ia64/kernel/perfmon.c            |   3 +-
 trunk/arch/ia64/mm/discontig.c              |   7 +-
 trunk/arch/ia64/mm/fault.c                  |  34 +-
 trunk/arch/ia64/mm/init.c                   |  13 +-
 trunk/arch/ia64/mm/tlb.c                    |   2 -
 trunk/arch/m32r/mm/init.c                   |   9 +-
 trunk/arch/m32r/mm/ioremap.c                |   4 +-
 trunk/arch/m68k/Kconfig                     |  24 +-
 trunk/arch/m68k/atari/stram.c               | 918 +++++++++++++++++-
 trunk/arch/m68k/mm/kmap.c                   |   2 +-
 trunk/arch/m68k/sun3x/dvma.c                |   2 +-
 trunk/arch/mips/kernel/irixelf.c            |   1 +
 trunk/arch/mips/mm/ioremap.c                |   4 +-
 trunk/arch/parisc/kernel/cache.c            |  24 +-
 trunk/arch/parisc/kernel/pci-dma.c          |   2 +-
 trunk/arch/parisc/mm/init.c                 |   3 -
 trunk/arch/parisc/mm/ioremap.c              |   6 +-
 trunk/arch/ppc/kernel/dma-mapping.c         |   6 +-
 trunk/arch/ppc/mm/4xx_mmu.c                 |   4 +
 trunk/arch/ppc/mm/pgtable.c                 |   4 +-
 trunk/arch/ppc64/kernel/vdso.c              |  12 +-
 trunk/arch/ppc64/mm/imalloc.c               |   5 +
 trunk/arch/ppc64/mm/init.c                  |  87 +-
 trunk/arch/s390/mm/ioremap.c                |   4 +-
 trunk/arch/sh/mm/fault.c                    |  40 +-
 trunk/arch/sh/mm/hugetlbpage.c              |   2 +
 trunk/arch/sh/mm/ioremap.c                  |   4 +-
 trunk/arch/sh64/mm/cache.c                  |  68 +-
 trunk/arch/sh64/mm/hugetlbpage.c            | 188 +++-
 trunk/arch/sh64/mm/ioremap.c                |   4 +-
 trunk/arch/sparc/mm/generic.c               |   7 +-
 trunk/arch/sparc64/kernel/binfmt_aout32.c   |   1 +
 trunk/arch/sparc64/mm/generic.c             |   9 +-
 trunk/arch/sparc64/mm/tlb.c                 |   7 +-
 trunk/arch/um/include/tlb.h                 |   1 +
 trunk/arch/um/kernel/process_kern.c         |   8 +-
 trunk/arch/um/kernel/skas/mmu.c             |   4 +-
 trunk/arch/um/kernel/tt/tlb.c               |  36 +
 trunk/arch/x86_64/ia32/ia32_aout.c          |   1 +
 trunk/arch/x86_64/mm/ioremap.c              |   4 +-
 trunk/drivers/acpi/acpi_memhotplug.c        |   5 +-
 trunk/drivers/base/Makefile                 |   1 -
 trunk/drivers/base/init.c                   |   2 -
 trunk/drivers/base/memory.c                 | 452 ---------
 trunk/drivers/scsi/sg.c                     |  12 +-
 trunk/drivers/scsi/st.c                     |  10 +-
 trunk/fs/afs/file.c                         |   4 +-
 trunk/fs/binfmt_aout.c                      |   1 +
 trunk/fs/binfmt_elf.c                       |   1 +
 trunk/fs/binfmt_elf_fdpic.c                 |   7 +
 trunk/fs/binfmt_flat.c                      |   1 +
 trunk/fs/binfmt_som.c                       |   1 +
 trunk/fs/buffer.c                           |   2 +-
 trunk/fs/compat.c                           |   1 +
 trunk/fs/direct-io.c                        |   4 +-
 trunk/fs/exec.c                             |  17 +-
 trunk/fs/hugetlbfs/inode.c                  | 206 ++--
 trunk/fs/jfs/jfs_metapage.c                 |  12 +-
 trunk/fs/proc/array.c                       |   2 +-
 trunk/fs/proc/task_mmu.c                    |  51 +-
 trunk/fs/xfs/linux-2.6/xfs_buf.c            |   7 +-
 trunk/include/asm-alpha/barrier.h           |   2 -
 trunk/include/asm-alpha/rwsem.h             |   5 -
 trunk/include/asm-arm/tlb.h                 |  23 +-
 trunk/include/asm-arm26/tlb.h               |  47 +-
 trunk/include/asm-generic/4level-fixup.h    |  11 +-
 trunk/include/asm-generic/pgtable.h         |   2 +-
 trunk/include/asm-generic/tlb.h             |  23 +-
 trunk/include/asm-i386/mmzone.h             |   6 +
 trunk/include/asm-i386/pgtable.h            |   3 +-
 trunk/include/asm-i386/rwsem.h              |   5 -
 trunk/include/asm-ia64/rwsem.h              |   5 -
 trunk/include/asm-ia64/tlb.h                |  19 +-
 trunk/include/asm-m32r/mmzone.h             |   6 +
 trunk/include/asm-parisc/cacheflush.h       |  35 +-
 trunk/include/asm-parisc/mmzone.h           |   6 +
 trunk/include/asm-parisc/tlbflush.h         |   3 +-
 trunk/include/asm-ppc/rwsem.h               |   5 -
 trunk/include/asm-ppc64/mmzone.h            |   3 +
 trunk/include/asm-ppc64/pgtable.h           |   4 +-
 trunk/include/asm-ppc64/rwsem.h             |   5 -
 trunk/include/asm-s390/rwsem.h              |   5 -
 trunk/include/asm-sh/rwsem.h                |   5 -
 trunk/include/asm-sparc64/rwsem.h           |   5 -
 trunk/include/asm-sparc64/tlb.h             |  29 +-
 trunk/include/asm-um/pgtable.h              |   2 +-
 trunk/include/asm-x86_64/rwsem.h            |   5 -
 trunk/include/linux/buffer_head.h           |   6 +-
 trunk/include/linux/hugetlb.h               |   2 +
 trunk/include/linux/memory.h                |  94 --
 trunk/include/linux/memory_hotplug.h        | 104 --
 trunk/include/linux/mempolicy.h             |   7 +-
 trunk/include/linux/mm.h                    | 150 ++-
 trunk/include/linux/mmzone.h                |  28 -
 trunk/include/linux/rmap.h                  |   4 +-
 trunk/include/linux/rwsem-spinlock.h        |   5 -
 trunk/include/linux/scatterlist.h           |  17 +-
 trunk/include/linux/sched.h                 |  65 +-
 trunk/include/linux/vmalloc.h               |   8 +-
 trunk/ipc/shm.c                             |   7 +-
 trunk/kernel/acct.c                         |   2 +-
 trunk/kernel/exit.c                         |   5 +-
 trunk/kernel/fork.c                         |  31 +-
 trunk/kernel/futex.c                        |   6 +-
 trunk/kernel/kexec.c                        |   4 +-
 trunk/kernel/power/swsusp.c                 |  25 +-
 trunk/kernel/sched.c                        |   2 +
 trunk/kernel/timer.c                        |   9 -
 trunk/mm/Kconfig                            |  21 -
 trunk/mm/Makefile                           |   2 +-
 trunk/mm/bootmem.c                          |   1 -
 trunk/mm/filemap.c                          |  12 +-
 trunk/mm/filemap_xip.c                      |  22 +-
 trunk/mm/fremap.c                           |  86 +-
 trunk/mm/hugetlb.c                          | 207 ++--
 trunk/mm/madvise.c                          |   2 +-
 trunk/mm/memory.c                           | 993 ++++++++++----------
 trunk/mm/memory_hotplug.c                   | 138 ---
 trunk/mm/mempolicy.c                        | 393 ++++----
 trunk/mm/mmap.c                             | 126 ++-
 trunk/mm/mprotect.c                         |  19 +-
 trunk/mm/mremap.c                           | 193 ++--
 trunk/mm/msync.c                            |  78 +-
 trunk/mm/nommu.c                            |  18 +-
 trunk/mm/page_alloc.c                       | 207 ++--
 trunk/mm/page_io.c                          |   6 +-
 trunk/mm/rmap.c                             | 146 ++-
 trunk/mm/shmem.c                            |  28 +-
 trunk/mm/slab.c                             |   5 +-
 trunk/mm/sparse.c                           |  99 +-
 trunk/mm/swap.c                             |   6 +-
 trunk/mm/swap_state.c                       |  11 +-
 trunk/mm/swapfile.c                         |  41 +-
 trunk/mm/thrash.c                           |   2 +-
 trunk/mm/vmalloc.c                          |  77 +-
 trunk/mm/vmscan.c                           |   6 +-
 trunk/sound/core/pcm_native.c               |   9 +-
 161 files changed, 3352 insertions(+), 3222 deletions(-)
 delete mode 100644 trunk/drivers/base/memory.c
 delete mode 100644 trunk/include/linux/memory.h
 delete mode 100644 trunk/include/linux/memory_hotplug.h
 delete mode 100644 trunk/mm/memory_hotplug.c

diff --git a/[refs] b/[refs]
index d3ea6aca5ff5..f4dd7eca2155 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: b1459461f1e0abd5c28317d6bff6f2ca612a719d
+refs/heads/master: d32311fed70d12f14e585feb4653571b1e2b0e6d
diff --git a/trunk/Documentation/cachetlb.txt b/trunk/Documentation/cachetlb.txt
index 7eb715e07eda..e132fb1163b0 100644
--- a/trunk/Documentation/cachetlb.txt
+++ b/trunk/Documentation/cachetlb.txt
@@ -49,6 +49,9 @@ changes occur:
 	page table operations such as what happens during
 	fork, and exec.
 
+	Platform developers note that generic code will always
+	invoke this interface without mm->page_table_lock held.
+
 3) void flush_tlb_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 
@@ -69,6 +72,9 @@ changes occur:
 	call flush_tlb_page (see below) for each entry which may be
 	modified.
 
+	Platform developers note that generic code will always
+	invoke this interface with mm->page_table_lock held.
+
 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 
 	This time we need to remove the PAGE_SIZE sized translation
@@ -87,6 +93,9 @@ changes occur:
 
 	This is used primarily during fault processing.
 
+	Platform developers note that generic code will always
+	invoke this interface with mm->page_table_lock held.
+
 5) void flush_tlb_pgtables(struct mm_struct *mm,
 			   unsigned long start, unsigned long end)
 
diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt
index 5dffcfefc3c7..90766b75d1b7 100644
--- a/trunk/Documentation/kernel-parameters.txt
+++ b/trunk/Documentation/kernel-parameters.txt
@@ -1460,6 +1460,8 @@ running once the system is up.
 	stifb=		[HW]
 			Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
 
+	stram_swap=	[HW,M68k]
+
 	swiotlb=	[IA-64] Number of I/O TLB slabs
 
 	switches=	[HW,M68k]
diff --git a/trunk/Documentation/m68k/kernel-options.txt b/trunk/Documentation/m68k/kernel-options.txt
index d5d3f064f552..e191baad8308 100644
--- a/trunk/Documentation/m68k/kernel-options.txt
+++ b/trunk/Documentation/m68k/kernel-options.txt
@@ -626,7 +626,7 @@ ignored (others aren't affected).
     can be performed in optimal order. Not all SCSI devices support
     tagged queuing (:-().
 
-4.5 switches=
+4.6 switches=
 -------------
 
 Syntax: switches=<list of switches>
@@ -661,6 +661,28 @@ correctly.
 earlier initialization ("ov_"-less) takes precedence. But the
 switching-off on reset still happens in this case.
 
+4.5) stram_swap=
+----------------
+
+Syntax: stram_swap=<do_swap>[,<max_swap>]
+
+  This option is available only if the kernel has been compiled with
+CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines
+dynamically whether to actually use ST-RAM as swap space. (Currently,
+the fraction of ST-RAM must be less or equal 1/3 of total memory to
+enable this swapping.) You can override the kernel's decision by
+specifying this option. 1 for <do_swap> means always enable the swap,
+even if you have less alternate RAM. 0 stands for never swap to
+ST-RAM, even if it's small enough compared to the rest of memory.
+
+  If ST-RAM swapping is enabled, the kernel usually uses all free
+ST-RAM as swap "device". If the kernel resides in ST-RAM, the region
+allocated by it is obviously never used for swapping :-) You can also
+limit this amount by specifying the second parameter, <max_swap>, if
+you want to use parts of ST-RAM as normal system memory. <max_swap> is
+in kBytes and the number should be a multiple of 4 (otherwise: rounded
+down).
+
 5) Options for Amiga Only:
 ==========================
 
diff --git a/trunk/arch/alpha/mm/numa.c b/trunk/arch/alpha/mm/numa.c
index 6d5251254f68..c7481d59b6df 100644
--- a/trunk/arch/alpha/mm/numa.c
+++ b/trunk/arch/alpha/mm/numa.c
@@ -371,8 +371,6 @@ show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_online_node(nid) {
-		unsigned long flags;
-		pgdat_resize_lock(NODE_DATA(nid), &flags);
 		i = node_spanned_pages(nid);
 		while (i-- > 0) {
 			struct page *page = nid_page_nr(nid, i);
@@ -386,7 +384,6 @@ show_mem(void)
 			else
 				shared += page_count(page) - 1;
 		}
-		pgdat_resize_unlock(NODE_DATA(nid), &flags);
 	}
 	printk("%ld pages of RAM\n",total);
 	printk("%ld free pages\n",free);
diff --git a/trunk/arch/alpha/mm/remap.c b/trunk/arch/alpha/mm/remap.c
index a78356c3ead5..19817ad3d89b 100644
--- a/trunk/arch/alpha/mm/remap.c
+++ b/trunk/arch/alpha/mm/remap.c
@@ -2,6 +2,7 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 
+/* called with the page_table_lock held */
 static inline void 
 remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -30,6 +31,7 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
 	} while (address && (address < end));
 }
 
+/* called with the page_table_lock held */
 static inline int 
 remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -44,7 +46,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, 
@@ -68,6 +70,7 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -81,6 +84,7 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	return error;
 }
 
diff --git a/trunk/arch/arm/kernel/signal.c b/trunk/arch/arm/kernel/signal.c
index a917e3dd3666..a94d75fef598 100644
--- a/trunk/arch/arm/kernel/signal.c
+++ b/trunk/arch/arm/kernel/signal.c
@@ -139,33 +139,93 @@ struct iwmmxt_sigframe {
 	unsigned long	storage[0x98/4];
 };
 
+static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
+{
+	unsigned long addr = (unsigned long)uptr;
+	pgd_t *pgd = pgd_offset(mm, addr);
+	if (pgd_present(*pgd)) {
+		pmd_t *pmd = pmd_offset(pgd, addr);
+		if (pmd_present(*pmd)) {
+			pte_t *pte = pte_offset_map(pmd, addr);
+			return (pte_present(*pte) && (!wr || pte_write(*pte)));
+		}
+	}
+	return 0;
+}
+
+static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
+		       void (*copyfn)(void *, void __user *))
+{
+	unsigned char v, __user *userptr = uptr;
+	int err = 0;
+
+	do {
+		struct mm_struct *mm;
+
+		if (write) {
+			__put_user_error(0, userptr, err);
+			__put_user_error(0, userptr + size - 1, err);
+		} else {
+			__get_user_error(v, userptr, err);
+			__get_user_error(v, userptr + size - 1, err);
+		}
+
+		if (err)
+			break;
+
+		mm = current->mm;
+		spin_lock(&mm->page_table_lock);
+		if (page_present(mm, userptr, write) &&
+		    page_present(mm, userptr + size - 1, write)) {
+		    	copyfn(kptr, uptr);
+		} else
+			err = 1;
+		spin_unlock(&mm->page_table_lock);
+	} while (err);
+
+	return err;
+}
+
 static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	char kbuf[sizeof(*frame) + 8];
-	struct iwmmxt_sigframe *kframe;
+	int err = 0;
 
 	/* the iWMMXt context must be 64 bit aligned */
-	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
-	kframe->magic0 = IWMMXT_MAGIC0;
-	kframe->magic1 = IWMMXT_MAGIC1;
-	iwmmxt_task_copy(current_thread_info(), &kframe->storage);
-	return __copy_to_user(frame, kframe, sizeof(*frame));
+	WARN_ON((unsigned long)frame & 7);
+
+	__put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
+	__put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
+
+	/*
+	 * iwmmxt_task_copy() doesn't check user permissions.
+	 * Let's do a dummy write on the upper boundary to ensure
+	 * access to user mem is OK all way up.
+	 */
+	err |= copy_locked(&frame->storage, current_thread_info(),
+			   sizeof(frame->storage), 1, iwmmxt_task_copy);
+	return err;
 }
 
 static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	char kbuf[sizeof(*frame) + 8];
-	struct iwmmxt_sigframe *kframe;
+	unsigned long magic0, magic1;
+	int err = 0;
 
-	/* the iWMMXt context must be 64 bit aligned */
-	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
-	if (__copy_from_user(kframe, frame, sizeof(*frame)))
-		return -1;
-	if (kframe->magic0 != IWMMXT_MAGIC0 ||
-	    kframe->magic1 != IWMMXT_MAGIC1)
-		return -1;
-	iwmmxt_task_restore(current_thread_info(), &kframe->storage);
-	return 0;
+	/* the iWMMXt context is 64 bit aligned */
+	WARN_ON((unsigned long)frame & 7);
+
+	/*
+	 * Validate iWMMXt context signature.
+	 * Also, iwmmxt_task_restore() doesn't check user permissions.
+	 * Let's do a dummy write on the upper boundary to ensure
+	 * access to user mem is OK all way up.
+	 */
+	__get_user_error(magic0, &frame->magic0, err);
+	__get_user_error(magic1, &frame->magic1, err);
+	if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
+		err = copy_locked(&frame->storage, current_thread_info(),
+				  sizeof(frame->storage), 0, iwmmxt_task_restore);
+	return err;
 }
 
 #endif
diff --git a/trunk/arch/arm/kernel/traps.c b/trunk/arch/arm/kernel/traps.c
index 66e5a0516f23..baa09601a64e 100644
--- a/trunk/arch/arm/kernel/traps.c
+++ b/trunk/arch/arm/kernel/traps.c
@@ -483,33 +483,29 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		unsigned long addr = regs->ARM_r2;
 		struct mm_struct *mm = current->mm;
 		pgd_t *pgd; pmd_t *pmd; pte_t *pte;
-		spinlock_t *ptl;
 
 		regs->ARM_cpsr &= ~PSR_C_BIT;
-		down_read(&mm->mmap_sem);
+		spin_lock(&mm->page_table_lock);
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd))
 			goto bad_access;
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_present(*pmd))
 			goto bad_access;
-		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-		if (!pte_present(*pte) || !pte_write(*pte)) {
-			pte_unmap_unlock(pte, ptl);
+		pte = pte_offset_map(pmd, addr);
+		if (!pte_present(*pte) || !pte_write(*pte))
 			goto bad_access;
-		}
 		val = *(unsigned long *)addr;
 		val -= regs->ARM_r0;
 		if (val == 0) {
 			*(unsigned long *)addr = regs->ARM_r1;
 			regs->ARM_cpsr |= PSR_C_BIT;
 		}
-		pte_unmap_unlock(pte, ptl);
-		up_read(&mm->mmap_sem);
+		spin_unlock(&mm->page_table_lock);
 		return val;
 
 		bad_access:
-		up_read(&mm->mmap_sem);
+		spin_unlock(&mm->page_table_lock);
 		/* simulate a write access fault */
 		do_DataAbort(addr, 15 + (1 << 11), regs);
 		return -1;
diff --git a/trunk/arch/arm/mm/consistent.c b/trunk/arch/arm/mm/consistent.c
index 47b0b767f080..82f4d5e27c54 100644
--- a/trunk/arch/arm/mm/consistent.c
+++ b/trunk/arch/arm/mm/consistent.c
@@ -397,6 +397,8 @@ static int __init consistent_init(void)
 	pte_t *pte;
 	int ret = 0;
 
+	spin_lock(&init_mm.page_table_lock);
+
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -407,7 +409,7 @@ static int __init consistent_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -417,6 +419,8 @@ static int __init consistent_init(void)
 		consistent_pte = pte;
 	} while (0);
 
+	spin_unlock(&init_mm.page_table_lock);
+
 	return ret;
 }
 
diff --git a/trunk/arch/arm/mm/fault-armv.c b/trunk/arch/arm/mm/fault-armv.c
index 7fc1b35a6746..be4ab3d73c91 100644
--- a/trunk/arch/arm/mm/fault-armv.c
+++ b/trunk/arch/arm/mm/fault-armv.c
@@ -26,11 +26,6 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable.  However, we leave the write buffer on.
- *
- * Note that the pte lock held when calling update_mmu_cache must also
- * guard the pte (somewhere else in the same mm) that we modify here.
- * Therefore those configurations which might call adjust_pte (those
- * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 {
@@ -132,7 +127,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
  *  2. If we have multiple shared mappings of the same space in
  *     an object, we need to deal with the cache aliasing issues.
  *
- * Note that the pte lock will be held.
+ * Note that the page_table_lock will be held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
diff --git a/trunk/arch/arm/mm/ioremap.c b/trunk/arch/arm/mm/ioremap.c
index 0f128c28fee4..6fb1258df1b5 100644
--- a/trunk/arch/arm/mm/ioremap.c
+++ b/trunk/arch/arm/mm/ioremap.c
@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 
 	pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags);
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, pgprot);
@@ -97,6 +97,7 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 	phys_addr -= address;
 	dir = pgd_offset(&init_mm, address);
 	BUG_ON(address >= end);
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		if (!pmd) {
@@ -113,6 +114,7 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 		dir++;
 	} while (address && (address < end));
 
+	spin_unlock(&init_mm.page_table_lock);
 	flush_cache_vmap(start, end);
 	return err;
 }
diff --git a/trunk/arch/arm/mm/mm-armv.c b/trunk/arch/arm/mm/mm-armv.c
index 1221fdde1769..61bc2fa0511e 100644
--- a/trunk/arch/arm/mm/mm-armv.c
+++ b/trunk/arch/arm/mm/mm-armv.c
@@ -179,6 +179,11 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
 
 	if (!vectors_high()) {
+		/*
+		 * This lock is here just to satisfy pmd_alloc and pte_lock
+		 */
+		spin_lock(&mm->page_table_lock);
+
 		/*
 		 * On ARM, first page must always be allocated since it
 		 * contains the machine vectors.
@@ -196,14 +201,23 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 		set_pte(new_pte, *init_pte);
 		pte_unmap_nested(init_pte);
 		pte_unmap(new_pte);
+
+		spin_unlock(&mm->page_table_lock);
 	}
 
 	return new_pgd;
 
 no_pte:
+	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
+	free_pages((unsigned long)new_pgd, 2);
+	return NULL;
+
 no_pmd:
+	spin_unlock(&mm->page_table_lock);
 	free_pages((unsigned long)new_pgd, 2);
+	return NULL;
+
 no_pgd:
 	return NULL;
 }
@@ -229,7 +243,6 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
-	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:
diff --git a/trunk/arch/arm/oprofile/backtrace.c b/trunk/arch/arm/oprofile/backtrace.c
index 7c22c12618cc..df35c452a8bf 100644
--- a/trunk/arch/arm/oprofile/backtrace.c
+++ b/trunk/arch/arm/oprofile/backtrace.c
@@ -49,22 +49,42 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail)
 
 static struct frame_tail* user_backtrace(struct frame_tail *tail)
 {
-	struct frame_tail buftail[2];
+	struct frame_tail buftail;
 
-	/* Also check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
-		return NULL;
-	if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
+	/* hardware pte might not be valid due to dirty/accessed bit emulation
+	 * so we use copy_from_user and benefit from exception fixups */
+	if (copy_from_user(&buftail, tail, sizeof(struct frame_tail)))
 		return NULL;
 
-	oprofile_add_trace(buftail[0].lr);
+	oprofile_add_trace(buftail.lr);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (tail >= buftail[0].fp)
+	if (tail >= buftail.fp)
 		return NULL;
 
-	return buftail[0].fp-1;
+	return buftail.fp-1;
+}
+
+/* Compare two addresses and see if they're on the same page */
+#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \
+	== ((((unsigned long) y) + offset) >> PAGE_SHIFT))
+
+/* check that the page(s) containing the frame tail are present */
+static int pages_present(struct frame_tail *tail)
+{
+	struct mm_struct * mm = current->mm;
+
+	if (!check_user_page_readable(mm, (unsigned long)tail))
+		return 0;
+
+	if (CMP_ADDR_EQUAL(tail, tail, 8))
+		return 1;
+
+	if (!check_user_page_readable(mm, ((unsigned long)tail) + 8))
+		return 0;
+
+	return 1;
 }
 
 /*
@@ -98,6 +118,7 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs)
 void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
 	struct frame_tail *tail;
+	unsigned long last_address = 0;
 
 	tail = ((struct frame_tail *) regs->ARM_fp) - 1;
 
@@ -111,6 +132,13 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-	while (depth-- && tail && !((unsigned long) tail & 3))
+	while (depth-- && tail && !((unsigned long) tail & 3)) {
+		if ((!CMP_ADDR_EQUAL(last_address, tail, 0)
+			|| !CMP_ADDR_EQUAL(last_address, tail, 8))
+				&& !pages_present(tail))
+			return;
+		last_address = (unsigned long) tail;
 		tail = user_backtrace(tail);
+	}
 }
+
diff --git a/trunk/arch/arm26/mm/memc.c b/trunk/arch/arm26/mm/memc.c
index 34def6397c3c..8e8a2bb2487d 100644
--- a/trunk/arch/arm26/mm/memc.c
+++ b/trunk/arch/arm26/mm/memc.c
@@ -78,6 +78,12 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pgd)
 		goto no_pgd;
 
+	/*
+	 * This lock is here just to satisfy pmd_alloc and pte_lock
+         * FIXME: I bet we could avoid taking it pretty much altogether
+	 */
+	spin_lock(&mm->page_table_lock);
+
 	/*
 	 * On ARM, first page must always be allocated since it contains
 	 * the machine vectors.
@@ -86,7 +92,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 
-	new_pte = pte_alloc_map(mm, new_pmd, 0);
+	new_pte = pte_alloc_kernel(mm, new_pmd, 0);
 	if (!new_pte)
 		goto no_pte;
 
@@ -95,7 +101,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	init_pte = pte_offset(init_pmd, 0);
 
 	set_pte(new_pte, *init_pte);
-	pte_unmap(new_pte);
 
 	/*
 	 * the page table entries are zeroed
@@ -107,14 +112,23 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
 		(PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
 
+	spin_unlock(&mm->page_table_lock);
+
 	/* update MEMC tables */
 	cpu_memc_update_all(new_pgd);
 	return new_pgd;
 
 no_pte:
+	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
+	free_pgd_slow(new_pgd);
+	return NULL;
+
 no_pmd:
+	spin_unlock(&mm->page_table_lock);
 	free_pgd_slow(new_pgd);
+	return NULL;
+
 no_pgd:
 	return NULL;
 }
diff --git a/trunk/arch/cris/arch-v32/mm/tlb.c b/trunk/arch/cris/arch-v32/mm/tlb.c
index b08a28bb58ab..8233406798d3 100644
--- a/trunk/arch/cris/arch-v32/mm/tlb.c
+++ b/trunk/arch/cris/arch-v32/mm/tlb.c
@@ -175,8 +175,6 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-static DEFINE_SPINLOCK(mmu_context_lock);
-
 /* Called in schedule() just before actually doing the switch_to. */
 void
 switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -185,10 +183,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	int cpu = smp_processor_id();
 
 	/* Make sure there is a MMU context. */
-	spin_lock(&mmu_context_lock);
+	spin_lock(&next->page_table_lock);
 	get_mmu_context(next);
 	cpu_set(cpu, next->cpu_vm_mask);
-	spin_unlock(&mmu_context_lock);
+	spin_unlock(&next->page_table_lock);
 
 	/*
 	 * Remember the pgd for the fault handlers. Keep a seperate copy of it
diff --git a/trunk/arch/cris/mm/ioremap.c b/trunk/arch/cris/mm/ioremap.c
index a92ac9877582..ebba11e270fa 100644
--- a/trunk/arch/cris/mm/ioremap.c
+++ b/trunk/arch/cris/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, prot);
@@ -74,6 +74,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -93,6 +94,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/arch/frv/mm/dma-alloc.c b/trunk/arch/frv/mm/dma-alloc.c
index 342823aad758..cfc4f97490c6 100644
--- a/trunk/arch/frv/mm/dma-alloc.c
+++ b/trunk/arch/frv/mm/dma-alloc.c
@@ -55,18 +55,21 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot)
 	pte_t *pte;
 	int err = -ENOMEM;
 
+	spin_lock(&init_mm.page_table_lock);
+
 	/* Use upper 10 bits of VA to index the first level map */
 	pge = pgd_offset_k(va);
 	pue = pud_offset(pge, va);
 	pme = pmd_offset(pue, va);
 
 	/* Use middle 10 bits of VA to index the second-level map */
-	pte = pte_alloc_kernel(pme, va);
+	pte = pte_alloc_kernel(&init_mm, pme, va);
 	if (pte != 0) {
 		err = 0;
 		set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot));
 	}
 
+	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 
diff --git a/trunk/arch/frv/mm/pgalloc.c b/trunk/arch/frv/mm/pgalloc.c
index 2c67dfe5a6b3..4eaec0f3525b 100644
--- a/trunk/arch/frv/mm/pgalloc.c
+++ b/trunk/arch/frv/mm/pgalloc.c
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	set_page_private(page, (unsigned long)&pgd_list);
+	page->private = (unsigned long) &pgd_list;
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **)page_private(page);
+	pprev = (struct page **) page->private;
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;
diff --git a/trunk/arch/i386/kernel/vm86.c b/trunk/arch/i386/kernel/vm86.c
index fc1993564f98..16b485009622 100644
--- a/trunk/arch/i386/kernel/vm86.c
+++ b/trunk/arch/i386/kernel/vm86.c
@@ -134,16 +134,17 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 	return ret;
 }
 
-static void mark_screen_rdonly(struct mm_struct *mm)
+static void mark_screen_rdonly(struct task_struct * tsk)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
-	spinlock_t *ptl;
+	pte_t *pte, *mapped;
 	int i;
 
-	pgd = pgd_offset(mm, 0xA0000);
+	preempt_disable();
+	spin_lock(&tsk->mm->page_table_lock);
+	pgd = pgd_offset(tsk->mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
 	pud = pud_offset(pgd, 0xA0000);
@@ -152,14 +153,16 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	pmd = pmd_offset(pud, 0xA0000);
 	if (pmd_none_or_clear_bad(pmd))
 		goto out;
-	pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
+	pte = mapped = pte_offset_map(pmd, 0xA0000);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
 			set_pte(pte, pte_wrprotect(*pte));
 		pte++;
 	}
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap(mapped);
 out:
+	spin_unlock(&tsk->mm->page_table_lock);
+	preempt_enable();
 	flush_tlb();
 }
 
@@ -303,7 +306,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
-		mark_screen_rdonly(tsk->mm);
+		mark_screen_rdonly(tsk);
 	__asm__ __volatile__(
 		"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
 		"movl %0,%%esp\n\t"
diff --git a/trunk/arch/i386/mm/discontig.c b/trunk/arch/i386/mm/discontig.c
index c4af9638dbfa..244d8ec66be2 100644
--- a/trunk/arch/i386/mm/discontig.c
+++ b/trunk/arch/i386/mm/discontig.c
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
+extern void one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
 extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 			if (!pfn_valid(node_pfn))
 				continue;
 			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
+			one_highpage_init(page, node_pfn, bad_ppro);
 		}
 	}
 	totalram_pages += totalhigh_pages;
diff --git a/trunk/arch/i386/mm/init.c b/trunk/arch/i386/mm/init.c
index 542d9298da5e..2ebaf75f732e 100644
--- a/trunk/arch/i386/mm/init.c
+++ b/trunk/arch/i386/mm/init.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/efi.h>
-#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -267,46 +266,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;	
 }
 
-void __devinit free_new_highpage(struct page *page)
-{
-	set_page_count(page, 1);
-	__free_page(page);
-	totalhigh_pages++;
-}
-
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
 	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
-		free_new_highpage(page);
+		set_page_count(page, 1);
+		__free_page(page);
+		totalhigh_pages++;
 	} else
 		SetPageReserved(page);
 }
 
-static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
-{
-	free_new_highpage(page);
-	totalram_pages++;
-#ifdef CONFIG_FLATMEM
-	max_mapnr = max(pfn, max_mapnr);
-#endif
-	num_physpages++;
-	return 0;
-}
-
-/*
- * Not currently handling the NUMA case.
- * Assuming single node and all memory that
- * has been added dynamically that would be
- * onlined here is in HIGHMEM
- */
-void online_page(struct page *page)
-{
-	ClearPageReserved(page);
-	add_one_highpage_hotplug(page, page_to_pfn(page));
-}
-
-
 #ifdef CONFIG_NUMA
 extern void set_highmem_pages_init(int);
 #else
@@ -314,7 +284,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
 	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+		one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
 	totalram_pages += totalhigh_pages;
 }
 #endif /* CONFIG_FLATMEM */
@@ -645,28 +615,6 @@ void __init mem_init(void)
 #endif
 }
 
-/*
- * this is for the non-NUMA, single node SMP system case.
- * Specifically, in the case of x86, we will always add
- * memory to the highmem for now.
- */
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-int add_memory(u64 start, u64 size)
-{
-	struct pglist_data *pgdata = &contig_page_data;
-	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-
-	return __add_pages(zone, start_pfn, nr_pages);
-}
-
-int remove_memory(u64 start, u64 size)
-{
-	return -EINVAL;
-}
-#endif
-
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
 
diff --git a/trunk/arch/i386/mm/ioremap.c b/trunk/arch/i386/mm/ioremap.c
index 5d09de8d1c6b..f379b8d67558 100644
--- a/trunk/arch/i386/mm/ioremap.c
+++ b/trunk/arch/i386/mm/ioremap.c
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 	unsigned long pfn;
 
 	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel(pmd, addr);
+	pte = pte_alloc_kernel(&init_mm, pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -87,12 +87,14 @@ static int ioremap_page_range(unsigned long addr,
 	flush_cache_all();
 	phys_addr -= addr;
 	pgd = pgd_offset_k(addr);
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return err;
 }
diff --git a/trunk/arch/i386/mm/pgtable.c b/trunk/arch/i386/mm/pgtable.c
index 9db3242103be..dcdce2c6c532 100644
--- a/trunk/arch/i386/mm/pgtable.c
+++ b/trunk/arch/i386/mm/pgtable.c
@@ -31,13 +31,11 @@ void show_mem(void)
 	pg_data_t *pgdat;
 	unsigned long i;
 	struct page_state ps;
-	unsigned long flags;
 
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -50,7 +48,6 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
-		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk(KERN_INFO "%d pages of RAM\n", total);
 	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
@@ -191,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		set_page_private(pgd_list, (unsigned long)&page->index);
+		pgd_list->private = (unsigned long)&page->index;
 	pgd_list = page;
-	set_page_private(page, (unsigned long)&pgd_list);
+	page->private = (unsigned long)&pgd_list;
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page_private(page);
+	pprev = (struct page **)page->private;
 	*pprev = next;
 	if (next)
-		set_page_private(next, (unsigned long)pprev);
+		next->private = (unsigned long)pprev;
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
diff --git a/trunk/arch/i386/oprofile/backtrace.c b/trunk/arch/i386/oprofile/backtrace.c
index 21654be3f73f..65dfd2edb671 100644
--- a/trunk/arch/i386/oprofile/backtrace.c
+++ b/trunk/arch/i386/oprofile/backtrace.c
@@ -12,7 +12,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/ptrace.h>
-#include <asm/uaccess.h>
 
 struct frame_head {
 	struct frame_head * ebp;
@@ -22,22 +21,26 @@ struct frame_head {
 static struct frame_head *
 dump_backtrace(struct frame_head * head)
 {
-	struct frame_head bufhead[2];
-
-	/* Also check accessibility of one struct frame_head beyond */
-	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
-		return NULL;
-	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
-		return NULL;
-
-	oprofile_add_trace(bufhead[0].ret);
+	oprofile_add_trace(head->ret);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (head >= bufhead[0].ebp)
+	if (head >= head->ebp)
 		return NULL;
 
-	return bufhead[0].ebp;
+	return head->ebp;
+}
+
+/* check that the page(s) containing the frame head are present */
+static int pages_present(struct frame_head * head)
+{
+	struct mm_struct * mm = current->mm;
+
+	/* FIXME: only necessary once per page */
+	if (!check_user_page_readable(mm, (unsigned long)head))
+		return 0;
+
+	return check_user_page_readable(mm, (unsigned long)(head + 1));
 }
 
 /*
@@ -94,6 +97,15 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-	while (depth-- && head)
+#ifdef CONFIG_SMP
+	if (!spin_trylock(&current->mm->page_table_lock))
+		return;
+#endif
+
+	while (depth-- && head && pages_present(head))
 		head = dump_backtrace(head);
+
+#ifdef CONFIG_SMP
+	spin_unlock(&current->mm->page_table_lock);
+#endif
 }
diff --git a/trunk/arch/ia64/kernel/perfmon.c b/trunk/arch/ia64/kernel/perfmon.c
index f7dfc107cb7b..d71731ee5b61 100644
--- a/trunk/arch/ia64/kernel/perfmon.c
+++ b/trunk/arch/ia64/kernel/perfmon.c
@@ -2352,8 +2352,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	insert_vm_struct(mm, vma);
 
 	mm->total_vm  += size >> PAGE_SHIFT;
-	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
+	vm_stat_account(vma);
 	up_write(&task->mm->mmap_sem);
 
 	/*
diff --git a/trunk/arch/ia64/mm/discontig.c b/trunk/arch/ia64/mm/discontig.c
index a88cdb7232f8..a3788fb84809 100644
--- a/trunk/arch/ia64/mm/discontig.c
+++ b/trunk/arch/ia64/mm/discontig.c
@@ -555,13 +555,9 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		unsigned long present;
-		unsigned long flags;
+		unsigned long present = pgdat->node_present_pages;
 		int shared = 0, cached = 0, reserved = 0;
-
 		printk("Node ID: %d\n", pgdat->node_id);
-		pgdat_resize_lock(pgdat, &flags);
-		present = pgdat->node_present_pages;
 		for(i = 0; i < pgdat->node_spanned_pages; i++) {
 			struct page *page;
 			if (pfn_valid(pgdat->node_start_pfn + i))
@@ -575,7 +571,6 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page)-1;
 		}
-		pgdat_resize_unlock(pgdat, &flags);
 		total_present += present;
 		total_reserved += reserved;
 		total_cached += cached;
diff --git a/trunk/arch/ia64/mm/fault.c b/trunk/arch/ia64/mm/fault.c
index af7eb087dca7..3c32af910d60 100644
--- a/trunk/arch/ia64/mm/fault.c
+++ b/trunk/arch/ia64/mm/fault.c
@@ -19,6 +19,32 @@
 
 extern void die (char *, struct pt_regs *, long);
 
+/*
+ * This routine is analogous to expand_stack() but instead grows the
+ * register backing store (which grows towards higher addresses).
+ * Since the register backing store is access sequentially, we
+ * disallow growing the RBS by more than a page at a time.  Note that
+ * the VM_GROWSUP flag can be set on any VM area but that's fine
+ * because the total process size is still limited by RLIMIT_STACK and
+ * RLIMIT_AS.
+ */
+static inline long
+expand_backing_store (struct vm_area_struct *vma, unsigned long address)
+{
+	unsigned long grow;
+
+	grow = PAGE_SIZE >> PAGE_SHIFT;
+	if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
+	    || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
+		return -ENOMEM;
+	vma->vm_end += PAGE_SIZE;
+	vma->vm_mm->total_vm += grow;
+	if (vma->vm_flags & VM_LOCKED)
+		vma->vm_mm->locked_vm += grow;
+	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
+	return 0;
+}
+
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -159,13 +185,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 		if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
 		    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
 			goto bad_area;
-		/*
-		 * Since the register backing store is accessed sequentially,
-		 * we disallow growing it by more than a page at a time.
-		 */
-		if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
-			goto bad_area;
-		if (expand_upwards(vma, address))
+		if (expand_backing_store(vma, address))
 			goto bad_area;
 	}
 	goto good_area;
diff --git a/trunk/arch/ia64/mm/init.c b/trunk/arch/ia64/mm/init.c
index e3215ba64ffd..98246acd4991 100644
--- a/trunk/arch/ia64/mm/init.c
+++ b/trunk/arch/ia64/mm/init.c
@@ -158,7 +158,7 @@ ia64_init_addr_space (void)
 		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
 		vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
-		vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
+		vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
 		down_write(&current->mm->mmap_sem);
 		if (insert_vm_struct(current->mm, vma)) {
 			up_write(&current->mm->mmap_sem);
@@ -275,21 +275,26 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
 	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
 
+	spin_lock(&init_mm.page_table_lock);
 	{
 		pud = pud_alloc(&init_mm, pgd, address);
 		if (!pud)
 			goto out;
+
 		pmd = pmd_alloc(&init_mm, pud, address);
 		if (!pmd)
 			goto out;
-		pte = pte_alloc_kernel(pmd, address);
+		pte = pte_alloc_map(&init_mm, pmd, address);
 		if (!pte)
 			goto out;
-		if (!pte_none(*pte))
+		if (!pte_none(*pte)) {
+			pte_unmap(pte);
 			goto out;
+		}
 		set_pte(pte, mk_pte(page, pgprot));
+		pte_unmap(pte);
 	}
-  out:
+  out:	spin_unlock(&init_mm.page_table_lock);
 	/* no need for flush_tlb */
 	return page;
 }
diff --git a/trunk/arch/ia64/mm/tlb.c b/trunk/arch/ia64/mm/tlb.c
index c79a9b96d02b..c93e0f2b5fea 100644
--- a/trunk/arch/ia64/mm/tlb.c
+++ b/trunk/arch/ia64/mm/tlb.c
@@ -158,12 +158,10 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
 # ifdef CONFIG_SMP
 	platform_global_tlb_purge(mm, start, end, nbits);
 # else
-	preempt_disable();
 	do {
 		ia64_ptcl(start, (nbits<<2));
 		start += (1UL << nbits);
 	} while (start < end);
-	preempt_enable();
 # endif
 
 	ia64_srlz_i();			/* srlz.i implies srlz.d */
diff --git a/trunk/arch/m32r/mm/init.c b/trunk/arch/m32r/mm/init.c
index 6facf15b04f3..d9a40b1fe8ba 100644
--- a/trunk/arch/m32r/mm/init.c
+++ b/trunk/arch/m32r/mm/init.c
@@ -48,8 +48,6 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -62,7 +60,6 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
-		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%d pages of RAM\n", total);
 	printk("%d pages of HIGHMEM\n",highmem);
@@ -153,14 +150,10 @@ int __init reservedpages_count(void)
 	int reservedpages, nid, i;
 
 	reservedpages = 0;
-	for_each_online_node(nid) {
-		unsigned long flags;
-		pgdat_resize_lock(NODE_DATA(nid), &flags);
+	for_each_online_node(nid)
 		for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
 			if (PageReserved(nid_page_nr(nid, i)))
 				reservedpages++;
-		pgdat_resize_unlock(NODE_DATA(nid), &flags);
-	}
 
 	return reservedpages;
 }
diff --git a/trunk/arch/m32r/mm/ioremap.c b/trunk/arch/m32r/mm/ioremap.c
index a151849a605e..70c59055c19c 100644
--- a/trunk/arch/m32r/mm/ioremap.c
+++ b/trunk/arch/m32r/mm/ioremap.c
@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -90,6 +90,7 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -103,6 +104,7 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/arch/m68k/Kconfig b/trunk/arch/m68k/Kconfig
index 1dd5d18b2201..ba960bbc8e6d 100644
--- a/trunk/arch/m68k/Kconfig
+++ b/trunk/arch/m68k/Kconfig
@@ -388,11 +388,33 @@ config AMIGA_PCMCIA
 	  Include support in the kernel for pcmcia on Amiga 1200 and Amiga
 	  600. If you intend to use pcmcia cards say Y; otherwise say N.
 
+config STRAM_SWAP
+	bool "Support for ST-RAM as swap space"
+	depends on ATARI && BROKEN
+	---help---
+	  Some Atari 68k machines (including the 520STF and 1020STE) divide
+	  their addressable memory into ST and TT sections.  The TT section
+	  (up to 512MB) is the main memory; the ST section (up to 4MB) is
+	  accessible to the built-in graphics board, runs slower, and is
+	  present mainly for backward compatibility with older machines.
+
+	  This enables support for using (parts of) ST-RAM as swap space,
+	  instead of as normal system memory. This can first enhance system
+	  performance if you have lots of alternate RAM (compared to the size
+	  of ST-RAM), because executable code always will reside in faster
+	  memory. ST-RAM will remain as ultra-fast swap space. On the other
+	  hand, it allows much improved dynamic allocations of ST-RAM buffers
+	  for device driver modules (e.g. floppy, ACSI, SLM printer, DMA
+	  sound). The probability that such allocations at module load time
+	  fail is drastically reduced.
+
 config STRAM_PROC
 	bool "ST-RAM statistics in /proc"
 	depends on ATARI
 	help
-	  Say Y here to report ST-RAM usage statistics in /proc/stram.
+	  Say Y here to report ST-RAM usage statistics in /proc/stram.  See
+	  the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its
+	  uses.
 
 config HEARTBEAT
 	bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40
diff --git a/trunk/arch/m68k/atari/stram.c b/trunk/arch/m68k/atari/stram.c
index 22e0481a5f7b..5a3c106b40c8 100644
--- a/trunk/arch/m68k/atari/stram.c
+++ b/trunk/arch/m68k/atari/stram.c
@@ -15,9 +15,11 @@
 #include <linux/kdev_t.h>
 #include <linux/major.h>
 #include <linux/init.h>
+#include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <linux/shm.h>
 #include <linux/bootmem.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
@@ -31,6 +33,8 @@
 #include <asm/io.h>
 #include <asm/semaphore.h>
 
+#include <linux/swapops.h>
+
 #undef DEBUG
 
 #ifdef DEBUG
@@ -45,7 +49,8 @@
 #include <linux/proc_fs.h>
 #endif
 
-/*
+/* Pre-swapping comments:
+ *
  * ++roman:
  *
  * New version of ST-Ram buffer allocation. Instead of using the
@@ -70,6 +75,76 @@
  *
  */
 
+/*
+ * New Nov 1997: Use ST-RAM as swap space!
+ *
+ * In the past, there were often problems with modules that require ST-RAM
+ * buffers. Such drivers have to use __get_dma_pages(), which unfortunately
+ * often isn't very successful in allocating more than 1 page :-( [1] The net
+ * result was that most of the time you couldn't insmod such modules (ataflop,
+ * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm,
+ * which needs a 1 MB buffer... :-).
+ *
+ * To overcome this limitation, ST-RAM can now be turned into a very
+ * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel
+ * now tries to unswap some pages on that swap device to make some free (and
+ * contiguous) space. This works much better in comparison to
+ * __get_dma_pages(), since used swap pages can be selectively freed by either
+ * moving them to somewhere else in swap space, or by reading them back into
+ * system memory. Ok, there operation of unswapping isn't really cheap (for
+ * each page, one has to go through the page tables of all processes), but it
+ * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a
+ * module that needs ST-RAM). But it at least makes it possible to load such
+ * modules!
+ *
+ * It could also be that overall system performance increases a bit due to
+ * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or
+ * executing code in. It's then just a (very fast, compared to disk) back
+ * storage for not-so-often needed data. (But this effect must be compared
+ * with the loss of total memory...) Don't know if the effect is already
+ * visible on a TT, where the speed difference between ST- and TT-RAM isn't
+ * that dramatic, but it should on machines where TT-RAM is really much faster
+ * (e.g. Afterburner).
+ *
+ *   [1]: __get_free_pages() does a fine job if you only want one page, but if
+ * you want more (contiguous) pages, it can give you such a block only if
+ * there's already a free one. The algorithm can't try to free buffers or swap
+ * out something in order to make more free space, since all that page-freeing
+ * mechanisms work "target-less", i.e. they just free something, but not in a
+ * specific place. I.e., __get_free_pages() can't do anything to free
+ * *adjacent* pages :-( This situation becomes even worse for DMA memory,
+ * since the freeing algorithms are also blind to DMA capability of pages.
+ */
+
+/* 1998-10-20: ++andreas
+   unswap_by_move disabled because it does not handle swapped shm pages.
+*/
+
+/* 2000-05-01: ++andreas
+   Integrated with bootmem.  Remove all traces of unswap_by_move.
+*/
+
+#ifdef CONFIG_STRAM_SWAP
+#define ALIGN_IF_SWAP(x)	PAGE_ALIGN(x)
+#else
+#define ALIGN_IF_SWAP(x)	(x)
+#endif
+
+/* get index of swap page at address 'addr' */
+#define SWAP_NR(addr)		(((addr) - swap_start) >> PAGE_SHIFT)
+
+/* get address of swap page #'nr' */
+#define SWAP_ADDR(nr)		(swap_start + ((nr) << PAGE_SHIFT))
+
+/* get number of pages for 'n' bytes (already page-aligned) */
+#define N_PAGES(n)			((n) >> PAGE_SHIFT)
+
+/* The following two numbers define the maximum fraction of ST-RAM in total
+ * memory, below that the kernel would automatically use ST-RAM as swap
+ * space. This decision can be overridden with stram_swap= */
+#define MAX_STRAM_FRACTION_NOM		1
+#define MAX_STRAM_FRACTION_DENOM	3
+
 /* Start and end (virtual) of ST-RAM */
 static void *stram_start, *stram_end;
 
@@ -89,9 +164,10 @@ typedef struct stram_block {
 } BLOCK;
 
 /* values for flags field */
-#define BLOCK_FREE	0x01	/* free structure in the BLOCKs pool */
+#define BLOCK_FREE		0x01	/* free structure in the BLOCKs pool */
 #define BLOCK_KMALLOCED	0x02	/* structure allocated by kmalloc() */
-#define BLOCK_GFP	0x08	/* block allocated with __get_dma_pages() */
+#define BLOCK_GFP		0x08	/* block allocated with __get_dma_pages() */
+#define BLOCK_INSWAP	0x10	/* block allocated in swap space */
 
 /* list of allocated blocks */
 static BLOCK *alloc_list;
@@ -103,8 +179,60 @@ static BLOCK *alloc_list;
 #define N_STATIC_BLOCKS	20
 static BLOCK static_blocks[N_STATIC_BLOCKS];
 
+#ifdef CONFIG_STRAM_SWAP
+/* max. number of bytes to use for swapping
+ *  0 = no ST-RAM swapping
+ * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of
+ *      total memory
+ */
+static int max_swap_size = -1;
+
+/* start and end of swapping area */
+static void *swap_start, *swap_end;
+
+/* The ST-RAM's swap info structure */
+static struct swap_info_struct *stram_swap_info;
+
+/* The ST-RAM's swap type */
+static int stram_swap_type;
+
+/* Semaphore for get_stram_region.  */
+static DECLARE_MUTEX(stram_swap_sem);
+
+/* major and minor device number of the ST-RAM device; for the major, we use
+ * the same as Amiga z2ram, which is really similar and impossible on Atari,
+ * and for the minor a relatively odd number to avoid the user creating and
+ * using that device. */
+#define	STRAM_MAJOR		Z2RAM_MAJOR
+#define	STRAM_MINOR		13
+
+/* Some impossible pointer value */
+#define MAGIC_FILE_P	(struct file *)0xffffdead
+
+#ifdef DO_PROC
+static unsigned stat_swap_read;
+static unsigned stat_swap_write;
+static unsigned stat_swap_force;
+#endif /* DO_PROC */
+
+#endif /* CONFIG_STRAM_SWAP */
+
 /***************************** Prototypes *****************************/
 
+#ifdef CONFIG_STRAM_SWAP
+static int swap_init(void *start_mem, void *swap_data);
+static void *get_stram_region( unsigned long n_pages );
+static void free_stram_region( unsigned long offset, unsigned long n_pages
+			       );
+static int in_some_region(void *addr);
+static unsigned long find_free_region( unsigned long n_pages, unsigned long
+				       *total_free, unsigned long
+				       *region_free );
+static void do_stram_request(request_queue_t *);
+static int stram_open( struct inode *inode, struct file *filp );
+static int stram_release( struct inode *inode, struct file *filp );
+static void reserve_region(void *start, void *end);
+#endif
 static BLOCK *add_region( void *addr, unsigned long size );
 static BLOCK *find_region( void *addr );
 static int remove_region( BLOCK *block );
@@ -151,11 +279,84 @@ void __init atari_stram_init(void)
  */
 void __init atari_stram_reserve_pages(void *start_mem)
 {
+#ifdef CONFIG_STRAM_SWAP
+	/* if max_swap_size is negative (i.e. no stram_swap= option given),
+	 * determine at run time whether to use ST-RAM swapping */
+	if (max_swap_size < 0)
+		/* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION
+		 * of total memory. In that case, the max. size is set to 16 MB,
+		 * because ST-RAM can never be bigger than that.
+		 * Also, never use swapping on a Hades, there's no separate ST-RAM in
+		 * that machine. */
+		max_swap_size =
+			(!MACH_IS_HADES &&
+			 (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <=
+			  ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0;
+	DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size );
+#endif
+
 	/* always reserve first page of ST-RAM, the first 2 kB are
 	 * supervisor-only! */
 	if (!kernel_in_stram)
 		reserve_bootmem (0, PAGE_SIZE);
 
+#ifdef CONFIG_STRAM_SWAP
+	{
+		void *swap_data;
+
+		start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem);
+		/* determine first page to use as swap: if the kernel is
+		   in TT-RAM, this is the first page of (usable) ST-RAM;
+		   otherwise just use the end of kernel data (= start_mem) */
+		swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem;
+		/* decrement by one page, rest of kernel assumes that first swap page
+		 * is always reserved and maybe doesn't handle swp_entry == 0
+		 * correctly */
+		swap_start -= PAGE_SIZE;
+		swap_end = stram_end;
+		if (swap_end-swap_start > max_swap_size)
+			swap_end =  swap_start + max_swap_size;
+		DPRINTK( "atari_stram_reserve_pages: swapping enabled; "
+				 "swap=%p-%p\n", swap_start, swap_end);
+
+		/* reserve some amount of memory for maintainance of
+		 * swapping itself: one page for each 2048 (PAGE_SIZE/2)
+		 * swap pages. (2 bytes for each page) */
+		swap_data = start_mem;
+		start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1)
+			      >> (PAGE_SHIFT-1)) << PAGE_SHIFT;
+		/* correct swap_start if necessary */
+		if (swap_start + PAGE_SIZE == swap_data)
+			swap_start = start_mem - PAGE_SIZE;
+
+		if (!swap_init( start_mem, swap_data )) {
+			printk( KERN_ERR "ST-RAM swap space initialization failed\n" );
+			max_swap_size = 0;
+			return;
+		}
+		/* reserve region for swapping meta-data */
+		reserve_region(swap_data, start_mem);
+		/* reserve swapping area itself */
+		reserve_region(swap_start + PAGE_SIZE, swap_end);
+
+		/*
+		 * If the whole ST-RAM is used for swapping, there are no allocatable
+		 * dma pages left. But unfortunately, some shared parts of the kernel
+		 * (particularly the SCSI mid-level) call __get_dma_pages()
+		 * unconditionally :-( These calls then fail, and scsi.c even doesn't
+		 * check for NULL return values and just crashes. The quick fix for
+		 * this (instead of doing much clean up work in the SCSI code) is to
+		 * pretend all pages are DMA-able by setting mach_max_dma_address to
+		 * ULONG_MAX. This doesn't change any functionality so far, since
+		 * get_dma_pages() shouldn't be used on Atari anyway anymore (better
+		 * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA
+		 * memory. But unfortunately there's now no kind of warning (even not
+		 * a NULL return value) if you use get_dma_pages() nevertheless :-(
+		 * You just will get non-DMA-able memory...
+		 */
+		mach_max_dma_address = 0xffffffff;
+	}
+#endif
 }
 
 void atari_stram_mem_init_hook (void)
@@ -166,6 +367,7 @@ void atari_stram_mem_init_hook (void)
 
 /*
  * This is main public interface: somehow allocate a ST-RAM block
+ * There are three strategies:
  *
  *  - If we're before mem_init(), we have to make a static allocation. The
  *    region is taken in the kernel data area (if the kernel is in ST-RAM) or
@@ -173,9 +375,14 @@ void atari_stram_mem_init_hook (void)
  *    rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel
  *    address space in the latter case.
  *
- *  - If mem_init() already has been called, try with __get_dma_pages().
- *    This has the disadvantage that it's very hard to get more than 1 page,
- *    and it is likely to fail :-(
+ *  - If mem_init() already has been called and ST-RAM swapping is enabled,
+ *    try to get the memory from the (pseudo) swap-space, either free already
+ *    or by moving some other pages out of the swap.
+ *
+ *  - If mem_init() already has been called, and ST-RAM swapping is not
+ *    enabled, the only possibility is to try with __get_dma_pages(). This has
+ *    the disadvantage that it's very hard to get more than 1 page, and it is
+ *    likely to fail :-(
  *
  */
 void *atari_stram_alloc(long size, const char *owner)
@@ -186,13 +393,27 @@ void *atari_stram_alloc(long size, const char *owner)
 
 	DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner);
 
+	size = ALIGN_IF_SWAP(size);
+	DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size );
+#ifdef CONFIG_STRAM_SWAP
+	if (max_swap_size) {
+		/* If swapping is active: make some free space in the swap
+		   "device". */
+		DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, "
+				 "calling get_region\n" );
+		addr = get_stram_region( N_PAGES(size) );
+		flags = BLOCK_INSWAP;
+	}
+	else
+#endif
 	if (!mem_init_done)
 		return alloc_bootmem_low(size);
 	else {
-		/* After mem_init(): can only resort to __get_dma_pages() */
+		/* After mem_init() and no swapping: can only resort to
+		 * __get_dma_pages() */
 		addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size));
 		flags = BLOCK_GFP;
-		DPRINTK( "atari_stram_alloc: after mem_init, "
+		DPRINTK( "atari_stram_alloc: after mem_init, swapping off, "
 				 "get_pages=%p\n", addr );
 	}
 
@@ -201,7 +422,12 @@ void *atari_stram_alloc(long size, const char *owner)
 			/* out of memory for BLOCK structure :-( */
 			DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- "
 					 "freeing again\n" );
-			free_pages((unsigned long)addr, get_order(size));
+#ifdef CONFIG_STRAM_SWAP
+			if (flags == BLOCK_INSWAP)
+				free_stram_region( SWAP_NR(addr), N_PAGES(size) );
+			else
+#endif
+				free_pages((unsigned long)addr, get_order(size));
 			return( NULL );
 		}
 		block->owner = owner;
@@ -225,12 +451,25 @@ void atari_stram_free( void *addr )
 	DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, "
 			 "flags=%02x\n", block, block->size, block->owner, block->flags );
 
-	if (!(block->flags & BLOCK_GFP))
+#ifdef CONFIG_STRAM_SWAP
+	if (!max_swap_size) {
+#endif
+		if (block->flags & BLOCK_GFP) {
+			DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
+				get_order(block->size));
+			free_pages((unsigned long)addr, get_order(block->size));
+		}
+		else
+			goto fail;
+#ifdef CONFIG_STRAM_SWAP
+	}
+	else if (block->flags & BLOCK_INSWAP) {
+		DPRINTK( "atari_stram_free: is swap-alloced\n" );
+		free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) );
+	}
+	else
 		goto fail;
-
-	DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
-		get_order(block->size));
-	free_pages((unsigned long)addr, get_order(block->size));
+#endif
 	remove_region( block );
 	return;
 
@@ -239,6 +478,612 @@ void atari_stram_free( void *addr )
 			"(called from %p)\n", addr, __builtin_return_address(0) );
 }
 
+
+#ifdef CONFIG_STRAM_SWAP
+
+
+/* ------------------------------------------------------------------------ */
+/*						   Main Swapping Functions							*/
+/* ------------------------------------------------------------------------ */
+
+
+/*
+ * Initialize ST-RAM swap device
+ * (lots copied and modified from sys_swapon() in mm/swapfile.c)
+ */
+static int __init swap_init(void *start_mem, void *swap_data)
+{
+	static struct dentry fake_dentry;
+	static struct vfsmount fake_vfsmnt;
+	struct swap_info_struct *p;
+	struct inode swap_inode;
+	unsigned int type;
+	void *addr;
+	int i, j, k, prev;
+
+	DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n",
+		start_mem, swap_data);
+
+	/* need at least one page for swapping to (and this also isn't very
+	 * much... :-) */
+	if (swap_end - swap_start < 2*PAGE_SIZE) {
+		printk( KERN_WARNING "stram_swap_init: swap space too small\n" );
+		return( 0 );
+	}
+
+	/* find free slot in swap_info */
+	for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ )
+		if (!(p->flags & SWP_USED))
+			break;
+	if (type >= MAX_SWAPFILES) {
+		printk( KERN_WARNING "stram_swap_init: max. number of "
+				"swap devices exhausted\n" );
+		return( 0 );
+	}
+	if (type >= nr_swapfiles)
+		nr_swapfiles = type+1;
+
+	stram_swap_info = p;
+	stram_swap_type = type;
+
+	/* fake some dir cache entries to give us some name in /dev/swaps */
+	fake_dentry.d_parent = &fake_dentry;
+	fake_dentry.d_name.name = "stram (internal)";
+	fake_dentry.d_name.len = 16;
+	fake_vfsmnt.mnt_parent = &fake_vfsmnt;
+
+	p->flags        = SWP_USED;
+	p->swap_file    = &fake_dentry;
+	p->swap_vfsmnt  = &fake_vfsmnt;
+	p->swap_map	= swap_data;
+	p->cluster_nr   = 0;
+	p->next         = -1;
+	p->prio         = 0x7ff0;	/* a rather high priority, but not the higest
+								 * to give the user a chance to override */
+
+	/* call stram_open() directly, avoids at least the overhead in
+	 * constructing a dummy file structure... */
+	swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR );
+	stram_open( &swap_inode, MAGIC_FILE_P );
+	p->max = SWAP_NR(swap_end);
+
+	/* initialize swap_map: set regions that are already allocated or belong
+	 * to kernel data space to SWAP_MAP_BAD, otherwise to free */
+	j = 0; /* # of free pages */
+	k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */
+	p->lowest_bit = 0;
+	p->highest_bit = 0;
+	for( i = 1, addr = SWAP_ADDR(1); i < p->max;
+		 i++, addr += PAGE_SIZE ) {
+		if (in_some_region( addr )) {
+			p->swap_map[i] = SWAP_MAP_BAD;
+			++k;
+		}
+		else if (kernel_in_stram && addr < start_mem ) {
+			p->swap_map[i] = SWAP_MAP_BAD;
+		}
+		else {
+			p->swap_map[i] = 0;
+			++j;
+			if (!p->lowest_bit) p->lowest_bit = i;
+			p->highest_bit = i;
+		}
+	}
+	/* first page always reserved (and doesn't really belong to swap space) */
+	p->swap_map[0] = SWAP_MAP_BAD;
+
+	/* now swapping to this device ok */
+	p->pages = j + k;
+	swap_list_lock();
+	nr_swap_pages += j;
+	p->flags = SWP_WRITEOK;
+
+	/* insert swap space into swap_list */
+	prev = -1;
+	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+		if (p->prio >= swap_info[i].prio) {
+			break;
+		}
+		prev = i;
+	}
+	p->next = i;
+	if (prev < 0) {
+		swap_list.head = swap_list.next = p - swap_info;
+	} else {
+		swap_info[prev].next = p - swap_info;
+	}
+	swap_list_unlock();
+
+	printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n",
+			p->pages << 2, p->pages );
+	return( 1 );
+}
+
+
+/*
+ * The swap entry has been read in advance, and we return 1 to indicate
+ * that the page has been used or is no longer needed.
+ *
+ * Always set the resulting pte to be nowrite (the same as COW pages
+ * after one process has exited).  We don't know just how many PTEs will
+ * share this swap entry, so be cautious and let do_wp_page work out
+ * what to do if a write is requested later.
+ */
+static inline void unswap_pte(struct vm_area_struct * vma, unsigned long
+			      address, pte_t *dir, swp_entry_t entry,
+			      struct page *page)
+{
+	pte_t pte = *dir;
+
+	if (pte_none(pte))
+		return;
+	if (pte_present(pte)) {
+		/* If this entry is swap-cached, then page must already
+                   hold the right address for any copies in physical
+                   memory */
+		if (pte_page(pte) != page)
+			return;
+		/* We will be removing the swap cache in a moment, so... */
+		set_pte(dir, pte_mkdirty(pte));
+		return;
+	}
+	if (pte_val(pte) != entry.val)
+		return;
+
+	DPRINTK("unswap_pte: replacing entry %08lx by new page %p",
+		entry.val, page);
+	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+	swap_free(entry);
+	get_page(page);
+	inc_mm_counter(vma->vm_mm, rss);
+}
+
+static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir,
+			      unsigned long address, unsigned long size,
+			      unsigned long offset, swp_entry_t entry,
+			      struct page *page)
+{
+	pte_t * pte;
+	unsigned long end;
+
+	if (pmd_none(*dir))
+		return;
+	if (pmd_bad(*dir)) {
+		pmd_ERROR(*dir);
+		pmd_clear(dir);
+		return;
+	}
+	pte = pte_offset_kernel(dir, address);
+	offset += address & PMD_MASK;
+	address &= ~PMD_MASK;
+	end = address + size;
+	if (end > PMD_SIZE)
+		end = PMD_SIZE;
+	do {
+		unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page);
+		address += PAGE_SIZE;
+		pte++;
+	} while (address < end);
+}
+
+static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir,
+			      unsigned long address, unsigned long size,
+			      swp_entry_t entry, struct page *page)
+{
+	pmd_t * pmd;
+	unsigned long offset, end;
+
+	if (pgd_none(*dir))
+		return;
+	if (pgd_bad(*dir)) {
+		pgd_ERROR(*dir);
+		pgd_clear(dir);
+		return;
+	}
+	pmd = pmd_offset(dir, address);
+	offset = address & PGDIR_MASK;
+	address &= ~PGDIR_MASK;
+	end = address + size;
+	if (end > PGDIR_SIZE)
+		end = PGDIR_SIZE;
+	do {
+		unswap_pmd(vma, pmd, address, end - address, offset, entry,
+			   page);
+		address = (address + PMD_SIZE) & PMD_MASK;
+		pmd++;
+	} while (address < end);
+}
+
+static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+		       swp_entry_t entry, struct page *page)
+{
+	unsigned long start = vma->vm_start, end = vma->vm_end;
+
+	do {
+		unswap_pgd(vma, pgdir, start, end - start, entry, page);
+		start = (start + PGDIR_SIZE) & PGDIR_MASK;
+		pgdir++;
+	} while (start < end);
+}
+
+static void unswap_process(struct mm_struct * mm, swp_entry_t entry,
+			   struct page *page)
+{
+	struct vm_area_struct* vma;
+
+	/*
+	 * Go through process' page directory.
+	 */
+	if (!mm)
+		return;
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
+		unswap_vma(vma, pgd, entry, page);
+	}
+}
+
+
+static int unswap_by_read(unsigned short *map, unsigned long max,
+			  unsigned long start, unsigned long n_pages)
+{
+	struct task_struct *p;
+	struct page *page;
+	swp_entry_t entry;
+	unsigned long i;
+
+	DPRINTK( "unswapping %lu..%lu by reading in\n",
+			 start, start+n_pages-1 );
+
+	for( i = start; i < start+n_pages; ++i ) {
+		if (map[i] == SWAP_MAP_BAD) {
+			printk( KERN_ERR "get_stram_region: page %lu already "
+					"reserved??\n", i );
+			continue;
+		}
+
+		if (map[i]) {
+			entry = swp_entry(stram_swap_type, i);
+			DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n",
+				i, map[i], nr_swap_pages);
+
+			swap_device_lock(stram_swap_info);
+			map[i]++;
+			swap_device_unlock(stram_swap_info);
+			/* Get a page for the entry, using the existing
+			   swap cache page if there is one.  Otherwise,
+			   get a clean page and read the swap into it. */
+			page = read_swap_cache_async(entry, NULL, 0);
+			if (!page) {
+				swap_free(entry);
+				return -ENOMEM;
+			}
+			read_lock(&tasklist_lock);
+			for_each_process(p)
+				unswap_process(p->mm, entry, page);
+			read_unlock(&tasklist_lock);
+			shmem_unuse(entry, page);
+			/* Now get rid of the extra reference to the
+			   temporary page we've been using. */
+			if (PageSwapCache(page))
+				delete_from_swap_cache(page);
+			__free_page(page);
+	#ifdef DO_PROC
+			stat_swap_force++;
+	#endif
+		}
+
+		DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n",
+				 i, map[i], nr_swap_pages );
+		swap_list_lock();
+		swap_device_lock(stram_swap_info);
+		map[i] = SWAP_MAP_BAD;
+		if (stram_swap_info->lowest_bit == i)
+			stram_swap_info->lowest_bit++;
+		if (stram_swap_info->highest_bit == i)
+			stram_swap_info->highest_bit--;
+		--nr_swap_pages;
+		swap_device_unlock(stram_swap_info);
+		swap_list_unlock();
+	}
+
+	return 0;
+}
+
+/*
+ * reserve a region in ST-RAM swap space for an allocation
+ */
+static void *get_stram_region( unsigned long n_pages )
+{
+	unsigned short *map = stram_swap_info->swap_map;
+	unsigned long max = stram_swap_info->max;
+	unsigned long start, total_free, region_free;
+	int err;
+	void *ret = NULL;
+
+	DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages );
+
+	down(&stram_swap_sem);
+
+	/* disallow writing to the swap device now */
+	stram_swap_info->flags = SWP_USED;
+
+	/* find a region of n_pages pages in the swap space including as much free
+	 * pages as possible (and excluding any already-reserved pages). */
+	if (!(start = find_free_region( n_pages, &total_free, &region_free )))
+		goto end;
+	DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n",
+			 start, region_free );
+
+	err = unswap_by_read(map, max, start, n_pages);
+	if (err)
+		goto end;
+
+	ret = SWAP_ADDR(start);
+  end:
+	/* allow using swap device again */
+	stram_swap_info->flags = SWP_WRITEOK;
+	up(&stram_swap_sem);
+	DPRINTK( "get_stram_region: returning %p\n", ret );
+	return( ret );
+}
+
+
+/*
+ * free a reserved region in ST-RAM swap space
+ */
+static void free_stram_region( unsigned long offset, unsigned long n_pages )
+{
+	unsigned short *map = stram_swap_info->swap_map;
+
+	DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages );
+
+	if (offset < 1 || offset + n_pages > stram_swap_info->max) {
+		printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" );
+		return;
+	}
+
+	swap_list_lock();
+	swap_device_lock(stram_swap_info);
+	/* un-reserve the freed pages */
+	for( ; n_pages > 0; ++offset, --n_pages ) {
+		if (map[offset] != SWAP_MAP_BAD)
+			printk( KERN_ERR "free_stram_region: Swap page %lu was not "
+					"reserved\n", offset );
+		map[offset] = 0;
+	}
+
+	/* update swapping meta-data */
+	if (offset < stram_swap_info->lowest_bit)
+		stram_swap_info->lowest_bit = offset;
+	if (offset+n_pages-1 > stram_swap_info->highest_bit)
+		stram_swap_info->highest_bit = offset+n_pages-1;
+	if (stram_swap_info->prio > swap_info[swap_list.next].prio)
+		swap_list.next = swap_list.head;
+	nr_swap_pages += n_pages;
+	swap_device_unlock(stram_swap_info);
+	swap_list_unlock();
+}
+
+
+/* ------------------------------------------------------------------------ */
+/*						Utility Functions for Swapping						*/
+/* ------------------------------------------------------------------------ */
+
+
+/* is addr in some of the allocated regions? */
+static int in_some_region(void *addr)
+{
+	BLOCK *p;
+
+	for( p = alloc_list; p; p = p->next ) {
+		if (p->start <= addr && addr < p->start + p->size)
+			return( 1 );
+	}
+	return( 0 );
+}
+
+
+static unsigned long find_free_region(unsigned long n_pages,
+				      unsigned long *total_free,
+				      unsigned long *region_free)
+{
+	unsigned short *map = stram_swap_info->swap_map;
+	unsigned long max = stram_swap_info->max;
+	unsigned long head, tail, max_start;
+	long nfree, max_free;
+
+	/* first scan the swap space for a suitable place for the allocation */
+	head = 1;
+	max_start = 0;
+	max_free = -1;
+	*total_free = 0;
+
+  start_over:
+	/* increment tail until final window size reached, and count free pages */
+	nfree = 0;
+	for( tail = head; tail-head < n_pages && tail < max; ++tail ) {
+		if (map[tail] == SWAP_MAP_BAD) {
+			head = tail+1;
+			goto start_over;
+		}
+		if (!map[tail]) {
+			++nfree;
+			++*total_free;
+		}
+	}
+	if (tail-head < n_pages)
+		goto out;
+	if (nfree > max_free) {
+		max_start = head;
+		max_free  = nfree;
+		if (max_free >= n_pages)
+			/* don't need more free pages... :-) */
+			goto out;
+	}
+
+	/* now shift the window and look for the area where as much pages as
+	 * possible are free */
+	while( tail < max ) {
+		nfree -= (map[head++] == 0);
+		if (map[tail] == SWAP_MAP_BAD) {
+			head = tail+1;
+			goto start_over;
+		}
+		if (!map[tail]) {
+			++nfree;
+			++*total_free;
+		}
+		++tail;
+		if (nfree > max_free) {
+			max_start = head;
+			max_free  = nfree;
+			if (max_free >= n_pages)
+				/* don't need more free pages... :-) */
+				goto out;
+		}
+	}
+
+  out:
+	if (max_free < 0) {
+		printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented "
+				"-- can't allocate %lu pages\n", n_pages );
+		return( 0 );
+	}
+
+	*region_free = max_free;
+	return( max_start );
+}
+
+
+/* setup parameters from command line */
+void __init stram_swap_setup(char *str, int *ints)
+{
+	if (ints[0] >= 1)
+		max_swap_size = ((ints[1] < 0 ? 0 : ints[1]) * 1024) & PAGE_MASK;
+}
+
+
+/* ------------------------------------------------------------------------ */
+/*								ST-RAM device								*/
+/* ------------------------------------------------------------------------ */
+
+static int refcnt;
+
+static void do_stram_request(request_queue_t *q)
+{
+	struct request *req;
+
+	while ((req = elv_next_request(q)) != NULL) {
+		void *start = swap_start + (req->sector << 9);
+		unsigned long len = req->current_nr_sectors << 9;
+		if ((start + len) > swap_end) {
+			printk( KERN_ERR "stram: bad access beyond end of device: "
+					"block=%ld, count=%d\n",
+					req->sector,
+					req->current_nr_sectors );
+			end_request(req, 0);
+			continue;
+		}
+
+		if (req->cmd == READ) {
+			memcpy(req->buffer, start, len);
+#ifdef DO_PROC
+			stat_swap_read += N_PAGES(len);
+#endif
+		}
+		else {
+			memcpy(start, req->buffer, len);
+#ifdef DO_PROC
+			stat_swap_write += N_PAGES(len);
+#endif
+		}
+		end_request(req, 1);
+	}
+}
+
+
+static int stram_open( struct inode *inode, struct file *filp )
+{
+	if (filp != MAGIC_FILE_P) {
+		printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" );
+		return( -EPERM );
+	}
+	if (refcnt)
+		return( -EBUSY );
+	++refcnt;
+	return( 0 );
+}
+
+static int stram_release( struct inode *inode, struct file *filp )
+{
+	if (filp != MAGIC_FILE_P) {
+		printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" );
+		return( -EPERM );
+	}
+	if (refcnt > 0)
+		--refcnt;
+	return( 0 );
+}
+
+
+static struct block_device_operations stram_fops = {
+	.open =		stram_open,
+	.release =	stram_release,
+};
+
+static struct gendisk *stram_disk;
+static struct request_queue *stram_queue;
+static DEFINE_SPINLOCK(stram_lock);
+
+int __init stram_device_init(void)
+{
+	if (!MACH_IS_ATARI)
+		/* no point in initializing this, I hope */
+		return -ENXIO;
+
+	if (!max_swap_size)
+		/* swapping not enabled */
+		return -ENXIO;
+	stram_disk = alloc_disk(1);
+	if (!stram_disk)
+		return -ENOMEM;
+
+	if (register_blkdev(STRAM_MAJOR, "stram")) {
+		put_disk(stram_disk);
+		return -ENXIO;
+	}
+
+	stram_queue = blk_init_queue(do_stram_request, &stram_lock);
+	if (!stram_queue) {
+		unregister_blkdev(STRAM_MAJOR, "stram");
+		put_disk(stram_disk);
+		return -ENOMEM;
+	}
+
+	stram_disk->major = STRAM_MAJOR;
+	stram_disk->first_minor = STRAM_MINOR;
+	stram_disk->fops = &stram_fops;
+	stram_disk->queue = stram_queue;
+	sprintf(stram_disk->disk_name, "stram");
+	set_capacity(stram_disk, (swap_end - swap_start)/512);
+	add_disk(stram_disk);
+	return 0;
+}
+
+
+
+/* ------------------------------------------------------------------------ */
+/*							Misc Utility Functions							*/
+/* ------------------------------------------------------------------------ */
+
+/* reserve a range of pages */
+static void reserve_region(void *start, void *end)
+{
+	reserve_bootmem (virt_to_phys(start), end - start);
+}
+
+#endif /* CONFIG_STRAM_SWAP */
+
 
 /* ------------------------------------------------------------------------ */
 /*							  Region Management								*/
@@ -328,9 +1173,50 @@ int get_stram_list( char *buf )
 {
 	int len = 0;
 	BLOCK *p;
+#ifdef CONFIG_STRAM_SWAP
+	int i;
+	unsigned short *map = stram_swap_info->swap_map;
+	unsigned long max = stram_swap_info->max;
+	unsigned free = 0, used = 0, rsvd = 0;
+#endif
 
-	PRINT_PROC("Total ST-RAM:      %8u kB\n",
+#ifdef CONFIG_STRAM_SWAP
+	if (max_swap_size) {
+		for( i = 1; i < max; ++i ) {
+			if (!map[i])
+				++free;
+			else if (map[i] == SWAP_MAP_BAD)
+				++rsvd;
+			else
+				++used;
+		}
+		PRINT_PROC(
+			"Total ST-RAM:      %8u kB\n"
+			"Total ST-RAM swap: %8lu kB\n"
+			"Free swap:         %8u kB\n"
+			"Used swap:         %8u kB\n"
+			"Allocated swap:    %8u kB\n"
+			"Swap Reads:        %8u\n"
+			"Swap Writes:       %8u\n"
+			"Swap Forced Reads: %8u\n",
+			(stram_end - stram_start) >> 10,
+			(max-1) << (PAGE_SHIFT-10),
+			free << (PAGE_SHIFT-10),
+			used << (PAGE_SHIFT-10),
+			rsvd << (PAGE_SHIFT-10),
+			stat_swap_read,
+			stat_swap_write,
+			stat_swap_force );
+	}
+	else {
+#endif
+		PRINT_PROC( "ST-RAM swapping disabled\n" );
+		PRINT_PROC("Total ST-RAM:      %8u kB\n",
 			   (stram_end - stram_start) >> 10);
+#ifdef CONFIG_STRAM_SWAP
+	}
+#endif
+
 	PRINT_PROC( "Allocated regions:\n" );
 	for( p = alloc_list; p; p = p->next ) {
 		if (len + 50 >= PAGE_SIZE)
@@ -341,6 +1227,8 @@ int get_stram_list( char *buf )
 			   p->owner);
 		if (p->flags & BLOCK_GFP)
 			PRINT_PROC( "page-alloced)\n" );
+		else if (p->flags & BLOCK_INSWAP)
+			PRINT_PROC( "in swap)\n" );
 		else
 			PRINT_PROC( "??)\n" );
 	}
diff --git a/trunk/arch/m68k/mm/kmap.c b/trunk/arch/m68k/mm/kmap.c
index fe2383e36b06..5dcb3fa35ea9 100644
--- a/trunk/arch/m68k/mm/kmap.c
+++ b/trunk/arch/m68k/mm/kmap.c
@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag)
 			virtaddr += PTRTREESIZE;
 			size -= PTRTREESIZE;
 		} else {
-			pte_dir = pte_alloc_kernel(pmd_dir, virtaddr);
+			pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr);
 			if (!pte_dir) {
 				printk("ioremap: no mem for pte_dir\n");
 				return NULL;
diff --git a/trunk/arch/m68k/sun3x/dvma.c b/trunk/arch/m68k/sun3x/dvma.c
index 117481e86305..32e55adfeb8e 100644
--- a/trunk/arch/m68k/sun3x/dvma.c
+++ b/trunk/arch/m68k/sun3x/dvma.c
@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr,
 			pte_t *pte;
 			unsigned long end3;
 
-			if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) {
+			if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) {
 				ret = -ENOMEM;
 				goto out;
 			}
diff --git a/trunk/arch/mips/kernel/irixelf.c b/trunk/arch/mips/kernel/irixelf.c
index 7ce34d4aa220..99262fe64560 100644
--- a/trunk/arch/mips/kernel/irixelf.c
+++ b/trunk/arch/mips/kernel/irixelf.c
@@ -697,6 +697,7 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Do this so that we can load the interpreter, if need be.  We will
 	 * change some of these later.
 	 */
+	set_mm_counter(current->mm, rss, 0);
 	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 
diff --git a/trunk/arch/mips/mm/ioremap.c b/trunk/arch/mips/mm/ioremap.c
index 3101d1db5592..9c44ca70befa 100644
--- a/trunk/arch/mips/mm/ioremap.c
+++ b/trunk/arch/mips/mm/ioremap.c
@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -77,6 +77,7 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -95,6 +96,7 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/arch/parisc/kernel/cache.c b/trunk/arch/parisc/kernel/cache.c
index a065349aee37..e15f09eaed12 100644
--- a/trunk/arch/parisc/kernel/cache.c
+++ b/trunk/arch/parisc/kernel/cache.c
@@ -270,6 +270,7 @@ void flush_dcache_page(struct page *page)
 	unsigned long offset;
 	unsigned long addr;
 	pgoff_t pgoff;
+	pte_t *pte;
 	unsigned long pfn = page_to_pfn(page);
 
 
@@ -300,16 +301,21 @@ void flush_dcache_page(struct page *page)
 		 * taking a page fault if the pte doesn't exist.
 		 * This is just for speed.  If the page translation
 		 * isn't there, there's no point exciting the
-		 * nadtlb handler into a nullification frenzy.
-		 *
-		 * Make sure we really have this page: the private
+		 * nadtlb handler into a nullification frenzy */
+
+
+  		if(!(pte = translation_exists(mpnt, addr)))
+			continue;
+
+		/* make sure we really have this page: the private
 		 * mappings may cover this area but have COW'd this
-		 * particular page.
-		 */
-  		if (translation_exists(mpnt, addr, pfn)) {
-			__flush_cache_page(mpnt, addr);
-			break;
-		}
+		 * particular page */
+		if(pte_pfn(*pte) != pfn)
+  			continue;
+
+		__flush_cache_page(mpnt, addr);
+
+		break;
 	}
 	flush_dcache_mmap_unlock(mapping);
 }
diff --git a/trunk/arch/parisc/kernel/pci-dma.c b/trunk/arch/parisc/kernel/pci-dma.c
index f94a02ef3d95..ae6213d71670 100644
--- a/trunk/arch/parisc/kernel/pci-dma.c
+++ b/trunk/arch/parisc/kernel/pci-dma.c
@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr,
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, vaddr);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr);
 		if (!pte)
 			return -ENOMEM;
 		if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr))
diff --git a/trunk/arch/parisc/mm/init.c b/trunk/arch/parisc/mm/init.c
index 29b998e430e6..2886ad70db48 100644
--- a/trunk/arch/parisc/mm/init.c
+++ b/trunk/arch/parisc/mm/init.c
@@ -505,9 +505,7 @@ void show_mem(void)
 
 		for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
 			struct page *p;
-			unsigned long flags;
 
-			pgdat_resize_lock(NODE_DATA(i), &flags);
 			p = nid_page_nr(i, j) - node_start_pfn(i);
 
 			total++;
@@ -519,7 +517,6 @@ void show_mem(void)
 				free++;
 			else
 				shared += page_count(p) - 1;
-			pgdat_resize_unlock(NODE_DATA(i), &flags);
         	}
 	}
 #endif
diff --git a/trunk/arch/parisc/mm/ioremap.c b/trunk/arch/parisc/mm/ioremap.c
index 5c7a1b3b9326..f2df502cdae3 100644
--- a/trunk/arch/parisc/mm/ioremap.c
+++ b/trunk/arch/parisc/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(NULL, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -75,9 +75,10 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
-		pmd = pmd_alloc(&init_mm, dir, address);
+		pmd = pmd_alloc(dir, address);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -88,6 +89,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/arch/ppc/kernel/dma-mapping.c b/trunk/arch/ppc/kernel/dma-mapping.c
index 685fd0defe23..0f710d2baec6 100644
--- a/trunk/arch/ppc/kernel/dma-mapping.c
+++ b/trunk/arch/ppc/kernel/dma-mapping.c
@@ -335,6 +335,8 @@ static int __init dma_alloc_init(void)
 	pte_t *pte;
 	int ret = 0;
 
+	spin_lock(&init_mm.page_table_lock);
+
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -345,7 +347,7 @@ static int __init dma_alloc_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -355,6 +357,8 @@ static int __init dma_alloc_init(void)
 		consistent_pte = pte;
 	} while (0);
 
+	spin_unlock(&init_mm.page_table_lock);
+
 	return ret;
 }
 
diff --git a/trunk/arch/ppc/mm/4xx_mmu.c b/trunk/arch/ppc/mm/4xx_mmu.c
index 4d006aa1a0d1..b7bcbc232f39 100644
--- a/trunk/arch/ppc/mm/4xx_mmu.c
+++ b/trunk/arch/ppc/mm/4xx_mmu.c
@@ -110,11 +110,13 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
+		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
+		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_16M;
 		p += LARGE_PAGE_SIZE_16M;
@@ -125,8 +127,10 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
+		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp) = val;
+		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_4M;
 		p += LARGE_PAGE_SIZE_4M;
diff --git a/trunk/arch/ppc/mm/pgtable.c b/trunk/arch/ppc/mm/pgtable.c
index 6ea9185fd120..43505b1fc5d8 100644
--- a/trunk/arch/ppc/mm/pgtable.c
+++ b/trunk/arch/ppc/mm/pgtable.c
@@ -280,16 +280,18 @@ map_page(unsigned long va, phys_addr_t pa, int flags)
 	pte_t *pg;
 	int err = -ENOMEM;
 
+	spin_lock(&init_mm.page_table_lock);
 	/* Use upper 10 bits of VA to index the first level map */
 	pd = pmd_offset(pgd_offset_k(va), va);
 	/* Use middle 10 bits of VA to index the second-level map */
-	pg = pte_alloc_kernel(pd, va);
+	pg = pte_alloc_kernel(&init_mm, pd, va);
 	if (pg != 0) {
 		err = 0;
 		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
 		if (mem_init_done)
 			flush_HPTE(0, va, pmd_val(*pd));
 	}
+	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 
diff --git a/trunk/arch/ppc64/kernel/vdso.c b/trunk/arch/ppc64/kernel/vdso.c
index 4aacf521e3e4..efa985f05aca 100644
--- a/trunk/arch/ppc64/kernel/vdso.c
+++ b/trunk/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
 		return NOPAGE_SIGBUS;
 
 	/*
-	 * Last page is systemcfg.
+	 * Last page is systemcfg, special handling here, no get_page() a
+	 * this is a reserved page
 	 */
 	if ((vma->vm_end - address) <= PAGE_SIZE)
-		pg = virt_to_page(systemcfg);
-	else
-		pg = virt_to_page(vbase + offset);
+		return virt_to_page(systemcfg);
 
+	pg = virt_to_page(vbase + offset);
 	get_page(pg);
 	DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
 	 * gettimeofday will be totally dead. It's fine to use that for setting
 	 * breakpoints in the vDSO code pages though
 	 */
-	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 	vma->vm_flags |= mm->def_flags;
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 	vma->vm_ops = &vdso_vmops;
@@ -603,8 +603,6 @@ void __init vdso_init(void)
 		ClearPageReserved(pg);
 		get_page(pg);
 	}
-
-	get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
diff --git a/trunk/arch/ppc64/mm/imalloc.c b/trunk/arch/ppc64/mm/imalloc.c
index f4ca29cf5364..c65b87b92756 100644
--- a/trunk/arch/ppc64/mm/imalloc.c
+++ b/trunk/arch/ppc64/mm/imalloc.c
@@ -300,7 +300,12 @@ void im_free(void * addr)
 	for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
+
+			/* XXX: do we need the lock? */
+			spin_lock(&init_mm.page_table_lock);
 			unmap_vm_area(tmp);
+			spin_unlock(&init_mm.page_table_lock);
+
 			kfree(tmp);
 			up(&imlist_sem);
 			return;
diff --git a/trunk/arch/ppc64/mm/init.c b/trunk/arch/ppc64/mm/init.c
index e2bd7776622f..be64b157afce 100644
--- a/trunk/arch/ppc64/mm/init.c
+++ b/trunk/arch/ppc64/mm/init.c
@@ -104,8 +104,6 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -116,7 +114,6 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
-		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%ld pages of RAM\n", total);
 	printk("%ld reserved pages\n", reserved);
@@ -158,6 +155,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	unsigned long vsid;
 
 	if (mem_init_done) {
+		spin_lock(&init_mm.page_table_lock);
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -165,11 +163,12 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
 		if (!pmdp)
 			return -ENOMEM;
-		ptep = pte_alloc_kernel(pmdp, ea);
+		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
+		spin_unlock(&init_mm.page_table_lock);
 	} else {
 		unsigned long va, vpn, hash, hpteg;
 
@@ -650,14 +649,11 @@ void __init mem_init(void)
 #endif
 
 	for_each_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			if (PageReserved(page))
 				reservedpages++;
 		}
-		pgdat_resize_unlock(pgdat, &flags);
 	}
 
 	codesize = (unsigned long)&_etext - (unsigned long)&_stext;
@@ -871,80 +867,3 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-
-void online_page(struct page *page)
-{
-	ClearPageReserved(page);
-	free_cold_page(page);
-	totalram_pages++;
-	num_physpages++;
-}
-
-/*
- * This works only for the non-NUMA case.  Later, we'll need a lookup
- * to convert from real physical addresses to nid, that doesn't use
- * pfn_to_nid().
- */
-int __devinit add_memory(u64 start, u64 size)
-{
-	struct pglist_data *pgdata = NODE_DATA(0);
-	struct zone *zone;
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-
-	/* this should work for most non-highmem platforms */
-	zone = pgdata->node_zones;
-
-	return __add_pages(zone, start_pfn, nr_pages);
-
-	return 0;
-}
-
-/*
- * First pass at this code will check to determine if the remove
- * request is within the RMO.  Do not allow removal within the RMO.
- */
-int __devinit remove_memory(u64 start, u64 size)
-{
-	struct zone *zone;
-	unsigned long start_pfn, end_pfn, nr_pages;
-
-	start_pfn = start >> PAGE_SHIFT;
-	nr_pages = size >> PAGE_SHIFT;
-	end_pfn = start_pfn + nr_pages;
-
-	printk("%s(): Attempting to remove memoy in range "
-			"%lx to %lx\n", __func__, start, start+size);
-	/*
-	 * check for range within RMO
-	 */
-	zone = page_zone(pfn_to_page(start_pfn));
-
-	printk("%s(): memory will be removed from "
-			"the %s zone\n", __func__, zone->name);
-
-	/*
-	 * not handling removing memory ranges that
-	 * overlap multiple zones yet
-	 */
-	if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
-		goto overlap;
-
-	/* make sure it is NOT in RMO */
-	if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
-		printk("%s(): range to be removed must NOT be in RMO!\n",
-			__func__);
-		goto in_rmo;
-	}
-
-	return __remove_pages(zone, start_pfn, nr_pages);
-
-overlap:
-	printk("%s(): memory range to be removed overlaps "
-		"multiple zones!!!\n", __func__);
-in_rmo:
-	return -1;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/trunk/arch/s390/mm/ioremap.c b/trunk/arch/s390/mm/ioremap.c
index 0f6e9ecbefe2..c6c39d868bc8 100644
--- a/trunk/arch/s390/mm/ioremap.c
+++ b/trunk/arch/s390/mm/ioremap.c
@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -80,6 +80,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -93,6 +94,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }
diff --git a/trunk/arch/sh/mm/fault.c b/trunk/arch/sh/mm/fault.c
index 775f86cd3fe8..7abba2161da6 100644
--- a/trunk/arch/sh/mm/fault.c
+++ b/trunk/arch/sh/mm/fault.c
@@ -194,13 +194,10 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 			       unsigned long address)
 {
 	unsigned long addrmax = P4SEG;
-	pgd_t *pgd;
+	pgd_t *dir;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
-	struct mm_struct *mm;
-	spinlock_t *ptl;
-	int ret = 1;
 
 #ifdef CONFIG_SH_KGDB
 	if (kgdb_nofault && kgdb_bus_err_hook)
@@ -211,28 +208,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	addrmax = P4SEG_STORE_QUE + 0x04000000;
 #endif
 
-	if (address >= P3SEG && address < addrmax) {
-		pgd = pgd_offset_k(address);
-		mm = NULL;
-	} else if (address >= TASK_SIZE)
+	if (address >= P3SEG && address < addrmax)
+		dir = pgd_offset_k(address);
+	else if (address >= TASK_SIZE)
 		return 1;
-	else if (!(mm = current->mm))
+	else if (!current->mm)
 		return 1;
 	else
-		pgd = pgd_offset(mm, address);
+		dir = pgd_offset(current->mm, address);
 
-	pmd = pmd_offset(pgd, address);
-	if (pmd_none_or_clear_bad(pmd))
+	pmd = pmd_offset(dir, address);
+	if (pmd_none(*pmd))
 		return 1;
-	if (mm)
-		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-	else
-		pte = pte_offset_kernel(pmd, address);
-
+	if (pmd_bad(*pmd)) {
+		pmd_ERROR(*pmd);
+		pmd_clear(pmd);
+		return 1;
+	}
+	pte = pte_offset_kernel(pmd, address);
 	entry = *pte;
 	if (pte_none(entry) || pte_not_present(entry)
 	    || (writeaccess && !pte_write(entry)))
-		goto unlock;
+		return 1;
 
 	if (writeaccess)
 		entry = pte_mkdirty(entry);
@@ -254,11 +251,8 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 
 	set_pte(pte, entry);
 	update_mmu_cache(NULL, address, entry);
-	ret = 0;
-unlock:
-	if (mm)
-		pte_unmap_unlock(pte, ptl);
-	return ret;
+
+	return 0;
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
diff --git a/trunk/arch/sh/mm/hugetlbpage.c b/trunk/arch/sh/mm/hugetlbpage.c
index 6b7a7688c98e..95bb1a6c6060 100644
--- a/trunk/arch/sh/mm/hugetlbpage.c
+++ b/trunk/arch/sh/mm/hugetlbpage.c
@@ -54,6 +54,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/trunk/arch/sh/mm/ioremap.c b/trunk/arch/sh/mm/ioremap.c
index e794e27a72f1..9f490c2742f0 100644
--- a/trunk/arch/sh/mm/ioremap.c
+++ b/trunk/arch/sh/mm/ioremap.c
@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -79,6 +79,7 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -92,6 +93,7 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/arch/sh64/mm/cache.c b/trunk/arch/sh64/mm/cache.c
index c0c1b21350d8..3b87e25ea773 100644
--- a/trunk/arch/sh64/mm/cache.c
+++ b/trunk/arch/sh64/mm/cache.c
@@ -584,36 +584,32 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr)
 	}
 }
 
-static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
-				unsigned long addr, unsigned long end)
+static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
-	spinlock_t *ptl;
 	unsigned long paddr;
 
-	if (!mm)
-		return; /* No way to find physical address of page */
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_bad(*pgd))
-		return;
-
-	pmd = pmd_offset(pgd, addr);
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
-		return;
-
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-	do {
-		entry = *pte;
-		if (pte_none(entry) || !pte_present(entry))
-			continue;
-		paddr = pte_val(entry) & PAGE_MASK;
-		sh64_dcache_purge_coloured_phy_page(paddr, addr);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
+	/* NOTE : all the callers of this have mm->page_table_lock held, so the
+	   following page table traversal is safe even on SMP/pre-emptible. */
+
+	if (!mm) return; /* No way to find physical address of page */
+	pgd = pgd_offset(mm, eaddr);
+	if (pgd_bad(*pgd)) return;
+
+	pmd = pmd_offset(pgd, eaddr);
+	if (pmd_none(*pmd) || pmd_bad(*pmd)) return;
+
+	pte = pte_offset_kernel(pmd, eaddr);
+	entry = *pte;
+	if (pte_none(entry) || !pte_present(entry)) return;
+
+	paddr = pte_val(entry) & PAGE_MASK;
+
+	sh64_dcache_purge_coloured_phy_page(paddr, eaddr);
+
 }
 /****************************************************************************/
 
@@ -672,7 +668,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 	int n_pages;
 
 	n_pages = ((end - start) >> PAGE_SHIFT);
-	if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
+	if (n_pages >= 64) {
 #if 1
 		sh64_dcache_purge_all();
 #else
@@ -711,10 +707,20 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 		}
 #endif
 	} else {
-		/* Small range, covered by a single page table page */
-		start &= PAGE_MASK;	/* should already be so */
-		end = PAGE_ALIGN(end);	/* should already be so */
-		sh64_dcache_purge_user_pages(mm, start, end);
+		/* 'Small' range */
+		unsigned long aligned_start;
+		unsigned long eaddr;
+		unsigned long last_page_start;
+
+		aligned_start = start & PAGE_MASK;
+		/* 'end' is 1 byte beyond the end of the range */
+		last_page_start = (end - 1) & PAGE_MASK;
+
+		eaddr = aligned_start;
+		while (eaddr <= last_page_start) {
+			sh64_dcache_purge_user_page(mm, eaddr);
+			eaddr += PAGE_SIZE;
+		}
 	}
 	return;
 }
@@ -874,7 +880,9 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 	   addresses from the user address space specified by mm, after writing
 	   back any dirty data.
 
-	   Note, 'end' is 1 byte beyond the end of the range to flush. */
+	   Note(1), 'end' is 1 byte beyond the end of the range to flush.
+
+	   Note(2), this is called with mm->page_table_lock held.*/
 
 	sh64_dcache_purge_user_range(mm, start, end);
 	sh64_icache_inv_user_page_range(mm, start, end);
@@ -890,7 +898,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned
 	   the I-cache must be searched too in case the page in question is
 	   both writable and being executed from (e.g. stack trampolines.)
 
-	   Note, this is called with pte lock held.
+	   Note(1), this is called with mm->page_table_lock held.
 	   */
 
 	sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);
diff --git a/trunk/arch/sh64/mm/hugetlbpage.c b/trunk/arch/sh64/mm/hugetlbpage.c
index ed6a505b3ee2..dcd9c8a8baf8 100644
--- a/trunk/arch/sh64/mm/hugetlbpage.c
+++ b/trunk/arch/sh64/mm/hugetlbpage.c
@@ -54,31 +54,41 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t entry)
+#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
+
+static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+			 struct page *page, pte_t * page_table, int write_access)
 {
-	int i;
+	unsigned long i;
+	pte_t entry;
+
+	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+
+	if (write_access)
+		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
+						       vma->vm_page_prot)));
+	else
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	entry = pte_mkyoung(entry);
+	mk_pte_huge(entry);
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		set_pte_at(mm, addr, ptep, entry);
-		ptep++;
-		addr += PAGE_SIZE;
+		set_pte(page_table, entry);
+		page_table++;
+
 		pte_val(entry) += PAGE_SIZE;
 	}
 }
 
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep)
+pte_t huge_ptep_get_and_clear(pte_t *ptep)
 {
 	pte_t entry;
-	int i;
 
 	entry = *ptep;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		pte_clear(mm, addr, ptep);
-		addr += PAGE_SIZE;
-		ptep++;
+		pte_clear(pte);
+		pte++;
 	}
 
 	return entry;
@@ -96,6 +106,79 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			    struct vm_area_struct *vma)
+{
+	pte_t *src_pte, *dst_pte, entry;
+	struct page *ptepage;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	int i;
+
+	while (addr < end) {
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			goto nomem;
+		src_pte = huge_pte_offset(src, addr);
+		BUG_ON(!src_pte || pte_none(*src_pte));
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+			set_pte(dst_pte, entry);
+			pte_val(entry) += PAGE_SIZE;
+			dst_pte++;
+		}
+		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+		addr += HPAGE_SIZE;
+	}
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct page **pages, struct vm_area_struct **vmas,
+			unsigned long *position, int *length, int i)
+{
+	unsigned long vaddr = *position;
+	int remainder = *length;
+
+	WARN_ON(!is_vm_hugetlb_page(vma));
+
+	while (vaddr < vma->vm_end && remainder) {
+		if (pages) {
+			pte_t *pte;
+			struct page *page;
+
+			pte = huge_pte_offset(mm, vaddr);
+
+			/* hugetlb should be locked, and hence, prefaulted */
+			BUG_ON(!pte || pte_none(*pte));
+
+			page = pte_page(*pte);
+
+			WARN_ON(!PageCompound(page));
+
+			get_page(page);
+			pages[i] = page;
+		}
+
+		if (vmas)
+			vmas[i] = vma;
+
+		vaddr += PAGE_SIZE;
+		--remainder;
+		++i;
+	}
+
+	*length = remainder;
+	*position = vaddr;
+
+	return i;
+}
+
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -112,3 +195,84 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
 	return NULL;
 }
+
+void unmap_hugepage_range(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte;
+	struct page *page;
+	int i;
+
+	BUG_ON(start & (HPAGE_SIZE - 1));
+	BUG_ON(end & (HPAGE_SIZE - 1));
+
+	for (address = start; address < end; address += HPAGE_SIZE) {
+		pte = huge_pte_offset(mm, address);
+		BUG_ON(!pte);
+		if (pte_none(*pte))
+			continue;
+		page = pte_page(*pte);
+		put_page(page);
+		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+			pte_clear(mm, address+(i*PAGE_SIZE), pte);
+			pte++;
+		}
+	}
+	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+	flush_tlb_range(vma, start, end);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
+
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		unsigned long idx;
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		struct page *page;
+
+		if (!pte) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (!pte_none(*pte))
+			continue;
+
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			/* charge the fs quota first */
+			if (hugetlb_get_quota(mapping)) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			page = alloc_huge_page();
+			if (!page) {
+				hugetlb_put_quota(mapping);
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+			if (! ret) {
+				unlock_page(page);
+			} else {
+				hugetlb_put_quota(mapping);
+				free_huge_page(page);
+				goto out;
+			}
+		}
+		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+	}
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
diff --git a/trunk/arch/sh64/mm/ioremap.c b/trunk/arch/sh64/mm/ioremap.c
index fb1866fa2c9d..f4003da556bc 100644
--- a/trunk/arch/sh64/mm/ioremap.c
+++ b/trunk/arch/sh64/mm/ioremap.c
@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 		BUG();
 
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -101,6 +101,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		error = -ENOMEM;
@@ -114,6 +115,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }
diff --git a/trunk/arch/sparc/mm/generic.c b/trunk/arch/sparc/mm/generic.c
index 9604893ffdbd..20ccb957fb77 100644
--- a/trunk/arch/sparc/mm/generic.c
+++ b/trunk/arch/sparc/mm/generic.c
@@ -73,16 +73,14 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
-	/* See comment in mm/memory.c remap_pfn_range */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
-
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
+	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pmd_t *pmd = pmd_alloc(mm, dir, from);
+		pmd_t *pmd = pmd_alloc(current->mm, dir, from);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -92,6 +90,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
+	spin_unlock(&mm->page_table_lock);
 
 	flush_tlb_range(vma, beg, end);
 	return error;
diff --git a/trunk/arch/sparc64/kernel/binfmt_aout32.c b/trunk/arch/sparc64/kernel/binfmt_aout32.c
index edf52d06b280..b2854ef221d0 100644
--- a/trunk/arch/sparc64/kernel/binfmt_aout32.c
+++ b/trunk/arch/sparc64/kernel/binfmt_aout32.c
@@ -241,6 +241,7 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 
+	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/trunk/arch/sparc64/mm/generic.c b/trunk/arch/sparc64/mm/generic.c
index 112c316e7cd2..c954d91f01d0 100644
--- a/trunk/arch/sparc64/mm/generic.c
+++ b/trunk/arch/sparc64/mm/generic.c
@@ -127,16 +127,14 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
-	/* See comment in mm/memory.c remap_pfn_range */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
-
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
+	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pud_t *pud = pud_alloc(mm, dir, from);
+		pud_t *pud = pud_alloc(current->mm, dir, from);
 		error = -ENOMEM;
 		if (!pud)
 			break;
@@ -146,7 +144,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
-
 	flush_tlb_range(vma, beg, end);
+	spin_unlock(&mm->page_table_lock);
+
 	return error;
 }
diff --git a/trunk/arch/sparc64/mm/tlb.c b/trunk/arch/sparc64/mm/tlb.c
index 8b104be4662b..90ca99d0b89c 100644
--- a/trunk/arch/sparc64/mm/tlb.c
+++ b/trunk/arch/sparc64/mm/tlb.c
@@ -18,7 +18,8 @@
 
 /* Heavily inspired by the ppc64 code.  */
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, };
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) =
+	{ NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, };
 
 void flush_tlb_pending(void)
 {
@@ -71,7 +72,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 
 no_cache_flush:
 
-	if (mp->fullmm)
+	if (mp->tlb_frozen)
 		return;
 
 	nr = mp->tlb_nr;
@@ -96,7 +97,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
 	unsigned long nr = mp->tlb_nr;
 	long s = start, e = end, vpte_base;
 
-	if (mp->fullmm)
+	if (mp->tlb_frozen)
 		return;
 
 	/* If start is greater than end, that is a real problem.  */
diff --git a/trunk/arch/um/include/tlb.h b/trunk/arch/um/include/tlb.h
index 8efc1e0f1b84..45d7da6c3b2c 100644
--- a/trunk/arch/um/include/tlb.h
+++ b/trunk/arch/um/include/tlb.h
@@ -34,6 +34,7 @@ struct host_vm_op {
 	} u;
 };
 
+extern void mprotect_kernel_vm(int w);
 extern void force_flush_all(void);
 extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
                              unsigned long end_addr, int force,
diff --git a/trunk/arch/um/kernel/process_kern.c b/trunk/arch/um/kernel/process_kern.c
index 34b54a3e2132..0d73ceeece72 100644
--- a/trunk/arch/um/kernel/process_kern.c
+++ b/trunk/arch/um/kernel/process_kern.c
@@ -222,7 +222,6 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	pte_t ptent;
 
 	if(task->mm == NULL) 
 		return(ERR_PTR(-EINVAL));
@@ -239,13 +238,12 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 		return(ERR_PTR(-EINVAL));
 
 	pte = pte_offset_kernel(pmd, addr);
-	ptent = *pte;
-	if(!pte_present(ptent))
+	if(!pte_present(*pte)) 
 		return(ERR_PTR(-EINVAL));
 
 	if(pte_out != NULL)
-		*pte_out = ptent;
-	return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
+		*pte_out = *pte;
+	return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
 }
 
 char *current_cmd(void)
diff --git a/trunk/arch/um/kernel/skas/mmu.c b/trunk/arch/um/kernel/skas/mmu.c
index 9e5e39cea821..240143b616a2 100644
--- a/trunk/arch/um/kernel/skas/mmu.c
+++ b/trunk/arch/um/kernel/skas/mmu.c
@@ -28,6 +28,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	pmd_t *pmd;
 	pte_t *pte;
 
+	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, proc);
 	pud = pud_alloc(mm, pgd, proc);
 	if (!pud)
@@ -62,6 +63,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	*pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
 	*pte = pte_mkexec(*pte);
 	*pte = pte_wrprotect(*pte);
+	spin_unlock(&mm->page_table_lock);
 	return(0);
 
  out_pmd:
@@ -69,6 +71,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
  out_pte:
 	pmd_free(pmd);
  out:
+	spin_unlock(&mm->page_table_lock);
 	return(-ENOMEM);
 }
 
@@ -144,7 +147,6 @@ void destroy_context_skas(struct mm_struct *mm)
 
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
-		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
diff --git a/trunk/arch/um/kernel/tt/tlb.c b/trunk/arch/um/kernel/tt/tlb.c
index ae6217c86135..f1d85dbb45b9 100644
--- a/trunk/arch/um/kernel/tt/tlb.c
+++ b/trunk/arch/um/kernel/tt/tlb.c
@@ -74,6 +74,42 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
                 atomic_inc(&vmchange_seq);
 }
 
+static void protect_vm_page(unsigned long addr, int w, int must_succeed)
+{
+	int err;
+
+	err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
+	if(err == 0) return;
+	else if((err == -EFAULT) || (err == -ENOMEM)){
+		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+		protect_vm_page(addr, w, 1);
+	}
+	else panic("protect_vm_page : protect failed, errno = %d\n", err);
+}
+
+void mprotect_kernel_vm(int w)
+{
+	struct mm_struct *mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long addr;
+	
+	mm = &init_mm;
+	for(addr = start_vm; addr < end_vm;){
+		pgd = pgd_offset(mm, addr);
+		pud = pud_offset(pgd, addr);
+		pmd = pmd_offset(pud, addr);
+		if(pmd_present(*pmd)){
+			pte = pte_offset_kernel(pmd, addr);
+			if(pte_present(*pte)) protect_vm_page(addr, w, 0);
+			addr += PAGE_SIZE;
+		}
+		else addr += PMD_SIZE;
+	}
+}
+
 void flush_tlb_kernel_vm_tt(void)
 {
         flush_tlb_kernel_range(start_vm, end_vm);
diff --git a/trunk/arch/x86_64/ia32/ia32_aout.c b/trunk/arch/x86_64/ia32/ia32_aout.c
index 93c60f4aa47a..3e6780fa0186 100644
--- a/trunk/arch/x86_64/ia32/ia32_aout.c
+++ b/trunk/arch/x86_64/ia32/ia32_aout.c
@@ -314,6 +314,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
+	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/trunk/arch/x86_64/mm/ioremap.c b/trunk/arch/x86_64/mm/ioremap.c
index ecf7acb5db9b..6972df480d2b 100644
--- a/trunk/arch/x86_64/mm/ioremap.c
+++ b/trunk/arch/x86_64/mm/ioremap.c
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,6 +105,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pud = pud_alloc(&init_mm, pgd, address);
@@ -118,6 +119,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgd++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
diff --git a/trunk/drivers/acpi/acpi_memhotplug.c b/trunk/drivers/acpi/acpi_memhotplug.c
index 2143609d2936..01a1bd239263 100644
--- a/trunk/drivers/acpi/acpi_memhotplug.c
+++ b/trunk/drivers/acpi/acpi_memhotplug.c
@@ -200,7 +200,8 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 	 * Note: Assume that this function returns zero on success
 	 */
 	result = add_memory(mem_device->start_addr,
-			    (mem_device->end_addr - mem_device->start_addr) + 1);
+			    (mem_device->end_addr - mem_device->start_addr) + 1,
+			    mem_device->read_write_attribute);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
 		mem_device->state = MEMORY_INVALID_STATE;
@@ -258,7 +259,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 	 * Ask the VM to offline this memory range.
 	 * Note: Assume that this function returns zero on success
 	 */
-	result = remove_memory(start, len);
+	result = remove_memory(start, len, attr);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
 		return_VALUE(result);
diff --git a/trunk/drivers/base/Makefile b/trunk/drivers/base/Makefile
index f12898d53078..66d9c4643fc1 100644
--- a/trunk/drivers/base/Makefile
+++ b/trunk/drivers/base/Makefile
@@ -7,7 +7,6 @@ obj-y			:= core.o sys.o bus.o dd.o \
 obj-y			+= power/
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/trunk/drivers/base/init.c b/trunk/drivers/base/init.c
index c648914b9cde..84e604e25c4f 100644
--- a/trunk/drivers/base/init.c
+++ b/trunk/drivers/base/init.c
@@ -9,7 +9,6 @@
 
 #include <linux/device.h>
 #include <linux/init.h>
-#include <linux/memory.h>
 
 #include "base.h"
 
@@ -34,6 +33,5 @@ void __init driver_init(void)
 	platform_bus_init();
 	system_bus_init();
 	cpu_dev_init();
-	memory_dev_init();
 	attribute_container_init();
 }
diff --git a/trunk/drivers/base/memory.c b/trunk/drivers/base/memory.c
deleted file mode 100644
index b7ddd651d664..000000000000
--- a/trunk/drivers/base/memory.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * drivers/base/memory.c - basic Memory class support
- *
- * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
- *            Dave Hansen <haveblue@us.ibm.com>
- *
- * This file provides the necessary infrastructure to represent
- * a SPARSEMEM-memory-model system's physical memory in /sysfs.
- * All arch-independent code that assumes MEMORY_HOTPLUG requires
- * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
- */
-
-#include <linux/sysdev.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>	/* capable() */
-#include <linux/topology.h>
-#include <linux/device.h>
-#include <linux/memory.h>
-#include <linux/kobject.h>
-#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-
-#define MEMORY_CLASS_NAME	"memory"
-
-static struct sysdev_class memory_sysdev_class = {
-	set_kset_name(MEMORY_CLASS_NAME),
-};
-EXPORT_SYMBOL(memory_sysdev_class);
-
-static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
-{
-	return MEMORY_CLASS_NAME;
-}
-
-static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
-			int num_envp, char *buffer, int buffer_size)
-{
-	int retval = 0;
-
-	return retval;
-}
-
-static struct kset_hotplug_ops memory_hotplug_ops = {
-	.name		= memory_hotplug_name,
-	.hotplug	= memory_hotplug,
-};
-
-static struct notifier_block *memory_chain;
-
-static int register_memory_notifier(struct notifier_block *nb)
-{
-        return notifier_chain_register(&memory_chain, nb);
-}
-
-static void unregister_memory_notifier(struct notifier_block *nb)
-{
-        notifier_chain_unregister(&memory_chain, nb);
-}
-
-/*
- * register_memory - Setup a sysfs device for a memory block
- */
-static int
-register_memory(struct memory_block *memory, struct mem_section *section,
-		struct node *root)
-{
-	int error;
-
-	memory->sysdev.cls = &memory_sysdev_class;
-	memory->sysdev.id = __section_nr(section);
-
-	error = sysdev_register(&memory->sysdev);
-
-	if (root && !error)
-		error = sysfs_create_link(&root->sysdev.kobj,
-					  &memory->sysdev.kobj,
-					  kobject_name(&memory->sysdev.kobj));
-
-	return error;
-}
-
-static void
-unregister_memory(struct memory_block *memory, struct mem_section *section,
-		struct node *root)
-{
-	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
-	BUG_ON(memory->sysdev.id != __section_nr(section));
-
-	sysdev_unregister(&memory->sysdev);
-	if (root)
-		sysfs_remove_link(&root->sysdev.kobj,
-				  kobject_name(&memory->sysdev.kobj));
-}
-
-/*
- * use this as the physical section index that this memsection
- * uses.
- */
-
-static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
-{
-	struct memory_block *mem =
-		container_of(dev, struct memory_block, sysdev);
-	return sprintf(buf, "%08lx\n", mem->phys_index);
-}
-
-/*
- * online, offline, going offline, etc.
- */
-static ssize_t show_mem_state(struct sys_device *dev, char *buf)
-{
-	struct memory_block *mem =
-		container_of(dev, struct memory_block, sysdev);
-	ssize_t len = 0;
-
-	/*
-	 * We can probably put these states in a nice little array
-	 * so that they're not open-coded
-	 */
-	switch (mem->state) {
-		case MEM_ONLINE:
-			len = sprintf(buf, "online\n");
-			break;
-		case MEM_OFFLINE:
-			len = sprintf(buf, "offline\n");
-			break;
-		case MEM_GOING_OFFLINE:
-			len = sprintf(buf, "going-offline\n");
-			break;
-		default:
-			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
-					mem->state);
-			WARN_ON(1);
-			break;
-	}
-
-	return len;
-}
-
-static inline int memory_notify(unsigned long val, void *v)
-{
-	return notifier_call_chain(&memory_chain, val, v);
-}
-
-/*
- * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
- * OK to have direct references to sparsemem variables in here.
- */
-static int
-memory_block_action(struct memory_block *mem, unsigned long action)
-{
-	int i;
-	unsigned long psection;
-	unsigned long start_pfn, start_paddr;
-	struct page *first_page;
-	int ret;
-	int old_state = mem->state;
-
-	psection = mem->phys_index;
-	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
-
-	/*
-	 * The probe routines leave the pages reserved, just
-	 * as the bootmem code does.  Make sure they're still
-	 * that way.
-	 */
-	if (action == MEM_ONLINE) {
-		for (i = 0; i < PAGES_PER_SECTION; i++) {
-			if (PageReserved(first_page+i))
-				continue;
-
-			printk(KERN_WARNING "section number %ld page number %d "
-				"not reserved, was it already online? \n",
-				psection, i);
-			return -EBUSY;
-		}
-	}
-
-	switch (action) {
-		case MEM_ONLINE:
-			start_pfn = page_to_pfn(first_page);
-			ret = online_pages(start_pfn, PAGES_PER_SECTION);
-			break;
-		case MEM_OFFLINE:
-			mem->state = MEM_GOING_OFFLINE;
-			memory_notify(MEM_GOING_OFFLINE, NULL);
-			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
-			ret = remove_memory(start_paddr,
-					    PAGES_PER_SECTION << PAGE_SHIFT);
-			if (ret) {
-				mem->state = old_state;
-				break;
-			}
-			memory_notify(MEM_MAPPING_INVALID, NULL);
-			break;
-		default:
-			printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
-					__FUNCTION__, mem, action, action);
-			WARN_ON(1);
-			ret = -EINVAL;
-	}
-	/*
-	 * For now, only notify on successful memory operations
-	 */
-	if (!ret)
-		memory_notify(action, NULL);
-
-	return ret;
-}
-
-static int memory_block_change_state(struct memory_block *mem,
-		unsigned long to_state, unsigned long from_state_req)
-{
-	int ret = 0;
-	down(&mem->state_sem);
-
-	if (mem->state != from_state_req) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = memory_block_action(mem, to_state);
-	if (!ret)
-		mem->state = to_state;
-
-out:
-	up(&mem->state_sem);
-	return ret;
-}
-
-static ssize_t
-store_mem_state(struct sys_device *dev, const char *buf, size_t count)
-{
-	struct memory_block *mem;
-	unsigned int phys_section_nr;
-	int ret = -EINVAL;
-
-	mem = container_of(dev, struct memory_block, sysdev);
-	phys_section_nr = mem->phys_index;
-
-	if (!valid_section_nr(phys_section_nr))
-		goto out;
-
-	if (!strncmp(buf, "online", min((int)count, 6)))
-		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
-	else if(!strncmp(buf, "offline", min((int)count, 7)))
-		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
-out:
-	if (ret)
-		return ret;
-	return count;
-}
-
-/*
- * phys_device is a bad name for this.  What I really want
- * is a way to differentiate between memory ranges that
- * are part of physical devices that constitute
- * a complete removable unit or fru.
- * i.e. do these ranges belong to the same physical device,
- * s.t. if I offline all of these sections I can then
- * remove the physical device?
- */
-static ssize_t show_phys_device(struct sys_device *dev, char *buf)
-{
-	struct memory_block *mem =
-		container_of(dev, struct memory_block, sysdev);
-	return sprintf(buf, "%d\n", mem->phys_device);
-}
-
-static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
-static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
-static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
-
-#define mem_create_simple_file(mem, attr_name)	\
-	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
-#define mem_remove_simple_file(mem, attr_name)	\
-	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
-
-/*
- * Block size attribute stuff
- */
-static ssize_t
-print_block_size(struct class *class, char *buf)
-{
-	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
-}
-
-static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
-
-static int block_size_init(void)
-{
-	sysfs_create_file(&memory_sysdev_class.kset.kobj,
-		&class_attr_block_size_bytes.attr);
-	return 0;
-}
-
-/*
- * Some architectures will have custom drivers to do this, and
- * will not need to do it from userspace.  The fake hot-add code
- * as well as ppc64 will do all of their discovery in userspace
- * and will require this interface.
- */
-#ifdef CONFIG_ARCH_MEMORY_PROBE
-static ssize_t
-memory_probe_store(struct class *class, const char __user *buf, size_t count)
-{
-	u64 phys_addr;
-	int ret;
-
-	phys_addr = simple_strtoull(buf, NULL, 0);
-
-	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
-
-	if (ret)
-		count = ret;
-
-	return count;
-}
-static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
-
-static int memory_probe_init(void)
-{
-	sysfs_create_file(&memory_sysdev_class.kset.kobj,
-		&class_attr_probe.attr);
-	return 0;
-}
-#else
-#define memory_probe_init(...)	do {} while (0)
-#endif
-
-/*
- * Note that phys_device is optional.  It is here to allow for
- * differentiation between which *physical* devices each
- * section belongs to...
- */
-
-static int add_memory_block(unsigned long node_id, struct mem_section *section,
-		     unsigned long state, int phys_device)
-{
-	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
-	int ret = 0;
-
-	if (!mem)
-		return -ENOMEM;
-
-	mem->phys_index = __section_nr(section);
-	mem->state = state;
-	init_MUTEX(&mem->state_sem);
-	mem->phys_device = phys_device;
-
-	ret = register_memory(mem, section, NULL);
-	if (!ret)
-		ret = mem_create_simple_file(mem, phys_index);
-	if (!ret)
-		ret = mem_create_simple_file(mem, state);
-	if (!ret)
-		ret = mem_create_simple_file(mem, phys_device);
-
-	return ret;
-}
-
-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all sysdev classes.
- */
-static struct memory_block *find_memory_block(struct mem_section *section)
-{
-	struct kobject *kobj;
-	struct sys_device *sysdev;
-	struct memory_block *mem;
-	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
-
-	/*
-	 * This only works because we know that section == sysdev->id
-	 * slightly redundant with sysdev_register()
-	 */
-	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
-
-	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
-	if (!kobj)
-		return NULL;
-
-	sysdev = container_of(kobj, struct sys_device, kobj);
-	mem = container_of(sysdev, struct memory_block, sysdev);
-
-	return mem;
-}
-
-int remove_memory_block(unsigned long node_id, struct mem_section *section,
-		int phys_device)
-{
-	struct memory_block *mem;
-
-	mem = find_memory_block(section);
-	mem_remove_simple_file(mem, phys_index);
-	mem_remove_simple_file(mem, state);
-	mem_remove_simple_file(mem, phys_device);
-	unregister_memory(mem, section, NULL);
-
-	return 0;
-}
-
-/*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
- */
-int register_new_memory(struct mem_section *section)
-{
-	return add_memory_block(0, section, MEM_OFFLINE, 0);
-}
-
-int unregister_memory_section(struct mem_section *section)
-{
-	if (!valid_section(section))
-		return -EINVAL;
-
-	return remove_memory_block(0, section, 0);
-}
-
-/*
- * Initialize the sysfs support for memory devices...
- */
-int __init memory_dev_init(void)
-{
-	unsigned int i;
-	int ret;
-
-	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
-	ret = sysdev_class_register(&memory_sysdev_class);
-
-	/*
-	 * Create entries for memory sections that were found
-	 * during boot and have been initialized
-	 */
-	for (i = 0; i < NR_MEM_SECTIONS; i++) {
-		if (!valid_section_nr(i))
-			continue;
-		add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
-	}
-
-	memory_probe_init();
-	block_size_init();
-
-	return ret;
-}
diff --git a/trunk/drivers/scsi/sg.c b/trunk/drivers/scsi/sg.c
index 2d30b46806bf..861e51375d70 100644
--- a/trunk/drivers/scsi/sg.c
+++ b/trunk/drivers/scsi/sg.c
@@ -1886,17 +1886,13 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		struct page *page = sgl[i].page;
-
-		/* XXX: just for debug. Remove when PageReserved is removed */
-		BUG_ON(PageReserved(page));
-		if (dirtied)
-			SetPageDirty(page);
-		/* unlock_page(page); */
+		if (dirtied && !PageReserved(sgl[i].page))
+			SetPageDirty(sgl[i].page);
+		/* unlock_page(sgl[i].page); */
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(page);
+		page_cache_release(sgl[i].page);
 	}
 
 	return 0;
diff --git a/trunk/drivers/scsi/st.c b/trunk/drivers/scsi/st.c
index da9766283bd7..5eb54d8019b4 100644
--- a/trunk/drivers/scsi/st.c
+++ b/trunk/drivers/scsi/st.c
@@ -4526,16 +4526,12 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		struct page *page = sgl[i].page;
-
-		/* XXX: just for debug. Remove when PageReserved is removed */
-		BUG_ON(PageReserved(page));
-		if (dirtied)
-			SetPageDirty(page);
+		if (dirtied && !PageReserved(sgl[i].page))
+			SetPageDirty(sgl[i].page);
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(page);
+		page_cache_release(sgl[i].page);
 	}
 
 	return 0;
diff --git a/trunk/fs/afs/file.c b/trunk/fs/afs/file.c
index 4975c9c193dd..0d576987ec67 100644
--- a/trunk/fs/afs/file.c
+++ b/trunk/fs/afs/file.c
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 		cachefs_uncache_page(vnode->cache, page);
 #endif
 
-		pageio = (struct cachefs_page *) page_private(page);
-		set_page_private(page, 0);
+		pageio = (struct cachefs_page *) page->private;
+		page->private = 0;
 		ClearPagePrivate(page);
 
 		if (pageio)
diff --git a/trunk/fs/binfmt_aout.c b/trunk/fs/binfmt_aout.c
index 72011826f0cb..dd9baabaf016 100644
--- a/trunk/fs/binfmt_aout.c
+++ b/trunk/fs/binfmt_aout.c
@@ -318,6 +318,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
+	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/trunk/fs/binfmt_elf.c b/trunk/fs/binfmt_elf.c
index 918ccc267e41..d4b15576e584 100644
--- a/trunk/fs/binfmt_elf.c
+++ b/trunk/fs/binfmt_elf.c
@@ -773,6 +773,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
+	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
diff --git a/trunk/fs/binfmt_elf_fdpic.c b/trunk/fs/binfmt_elf_fdpic.c
index dda87c4c82a3..134c9c0d1f54 100644
--- a/trunk/fs/binfmt_elf_fdpic.c
+++ b/trunk/fs/binfmt_elf_fdpic.c
@@ -294,7 +294,14 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
 				  &interp_params,
 				  &current->mm->start_stack,
 				  &current->mm->start_brk);
+#endif
+
+	/* do this so that we can load the interpreter, if need be
+	 * - we will change some of these later
+	 */
+	set_mm_counter(current->mm, rss, 0);
 
+#ifdef CONFIG_MMU
 	retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff --git a/trunk/fs/binfmt_flat.c b/trunk/fs/binfmt_flat.c
index 9d6625829b99..7974efa107bc 100644
--- a/trunk/fs/binfmt_flat.c
+++ b/trunk/fs/binfmt_flat.c
@@ -650,6 +650,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
+		set_mm_counter(current->mm, rss, 0);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
diff --git a/trunk/fs/binfmt_som.c b/trunk/fs/binfmt_som.c
index 00a91dc25d16..227a2682d2bf 100644
--- a/trunk/fs/binfmt_som.c
+++ b/trunk/fs/binfmt_som.c
@@ -259,6 +259,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
+	set_mm_counter(current->mm, rss, 0);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff --git a/trunk/fs/buffer.c b/trunk/fs/buffer.c
index 2066e4cb700c..b1667986442f 100644
--- a/trunk/fs/buffer.c
+++ b/trunk/fs/buffer.c
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	set_page_private(page, 0);
+	page->private = 0;
 	page_cache_release(page);
 }
 
diff --git a/trunk/fs/compat.c b/trunk/fs/compat.c
index 8e71cdbecc7c..a719e158e002 100644
--- a/trunk/fs/compat.c
+++ b/trunk/fs/compat.c
@@ -1490,6 +1490,7 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
+		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/trunk/fs/direct-io.c b/trunk/fs/direct-io.c
index 3931e7f1e6bf..0d06097bc995 100644
--- a/trunk/fs/direct-io.c
+++ b/trunk/fs/direct-io.c
@@ -162,7 +162,6 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
-		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -170,8 +169,7 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		page_cache_get(page);
-		dio->pages[0] = page;
+		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;
diff --git a/trunk/fs/exec.c b/trunk/fs/exec.c
index ba73797eb4cb..d2208f7c87db 100644
--- a/trunk/fs/exec.c
+++ b/trunk/fs/exec.c
@@ -309,36 +309,40 @@ void install_arg_page(struct vm_area_struct *vma,
 	pud_t * pud;
 	pmd_t * pmd;
 	pte_t * pte;
-	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto out;
+		goto out_sig;
 
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
+
+	spin_lock(&mm->page_table_lock);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
+	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
 		goto out;
 	if (!pte_none(*pte)) {
-		pte_unmap_unlock(pte, ptl);
+		pte_unmap(pte);
 		goto out;
 	}
-	inc_mm_counter(mm, anon_rss);
+	inc_mm_counter(mm, rss);
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
 
 	/* no need for flush_tlb */
 	return;
 out:
+	spin_unlock(&mm->page_table_lock);
+out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
 }
@@ -1203,6 +1207,7 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
+		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/trunk/fs/hugetlbfs/inode.c b/trunk/fs/hugetlbfs/inode.c
index e026c807e6b3..3a9b6d179cbd 100644
--- a/trunk/fs/hugetlbfs/inode.c
+++ b/trunk/fs/hugetlbfs/inode.c
@@ -45,58 +45,10 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
 
 int sysctl_hugetlb_shm_group;
 
-static void huge_pagevec_release(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); ++i)
-		put_page(pvec->pages[i]);
-
-	pagevec_reinit(pvec);
-}
-
-/*
- * huge_pages_needed tries to determine the number of new huge pages that
- * will be required to fully populate this VMA.  This will be equal to
- * the size of the VMA in huge pages minus the number of huge pages
- * (covered by this VMA) that are found in the page cache.
- *
- * Result is in bytes to be compatible with is_hugepage_mem_enough()
- */
-unsigned long
-huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	int i;
-	struct pagevec pvec;
-	unsigned long start = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff;
-	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
-
-	pagevec_init(&pvec, 0);
-	while (next < endpg) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
-			break;
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			if (page->index > next)
-				next = page->index;
-			if (page->index >= endpg)
-				break;
-			next++;
-			hugepages--;
-		}
-		huge_pagevec_release(&pvec);
-	}
-	return hugepages << HPAGE_SHIFT;
-}
-
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
-	unsigned long bytes;
 	loff_t len, vma_len;
 	int ret;
 
@@ -115,10 +67,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
-	bytes = huge_pages_needed(mapping, vma);
-	if (!is_hugepage_mem_enough(bytes))
-		return -ENOMEM;
-
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	down(&inode->i_sem);
@@ -131,8 +79,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = 0;
-	hugetlb_prefault_arch_hook(vma->vm_mm);
+	ret = hugetlb_prefault(mapping, vma);
+	if (ret)
+		goto out;
+
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
@@ -142,7 +92,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 /*
- * Called under down_write(mmap_sem).
+ * Called under down_write(mmap_sem), page_table_lock is not held
  */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -221,6 +171,16 @@ static int hugetlbfs_commit_write(struct file *file,
 	return -EINVAL;
 }
 
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); ++i)
+		put_page(pvec->pages[i]);
+
+	pagevec_reinit(pvec);
+}
+
 static void truncate_huge_page(struct page *page)
 {
 	clear_page_dirty(page);
@@ -264,35 +224,52 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
+
+	hlist_del_init(&inode->i_hash);
+	list_del_init(&inode->i_list);
+	list_del_init(&inode->i_sb_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+	spin_unlock(&inode_lock);
+
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
+
+	security_inode_delete(inode);
+
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
 	clear_inode(inode);
+	destroy_inode(inode);
 }
 
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct super_block *super_block = inode->i_sb;
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
 
-	if (!hlist_unhashed(&inode->i_hash)) {
-		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
-			list_move(&inode->i_list, &inode_unused);
-		inodes_stat.nr_unused++;
-		if (!sb || (sb->s_flags & MS_ACTIVE)) {
-			spin_unlock(&inode_lock);
-			return;
-		}
-		inode->i_state |= I_WILL_FREE;
+	if (hlist_unhashed(&inode->i_hash))
+		goto out_truncate;
+
+	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+		list_del(&inode->i_list);
+		list_add(&inode->i_list, &inode_unused);
+	}
+	inodes_stat.nr_unused++;
+	if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
 		spin_unlock(&inode_lock);
-		/*
-		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
-		 * in our backing_dev_info.
-		 */
-		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
-		inode->i_state &= ~I_WILL_FREE;
-		inodes_stat.nr_unused--;
-		hlist_del_init(&inode->i_hash);
+		return;
 	}
+
+	/* write_inode_now() ? */
+	inodes_stat.nr_unused--;
+	hlist_del_init(&inode->i_hash);
+out_truncate:
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
@@ -300,6 +277,13 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
+
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -307,7 +291,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
-		generic_delete_inode(inode);
+		hugetlbfs_delete_inode(inode);
 	else
 		hugetlbfs_forget_inode(inode);
 }
@@ -324,6 +308,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 
 	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
 		unsigned long h_vm_pgoff;
+		unsigned long v_length;
 		unsigned long v_offset;
 
 		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -334,8 +319,11 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		unmap_hugepage_range(vma,
-				vma->vm_start + v_offset, vma->vm_end);
+		v_length = vma->vm_end - vma->vm_start;
+
+		zap_hugepage_range(vma,
+				vma->vm_start + v_offset,
+				v_length - v_offset);
 	}
 }
 
@@ -391,6 +379,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 					gid_t gid, int mode, dev_t dev)
 {
 	struct inode *inode;
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		if (!sbinfo->free_inodes) {
+			spin_unlock(&sbinfo->stat_lock);
+			return NULL;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -532,51 +531,29 @@ static void hugetlbfs_put_super(struct super_block *sb)
 	}
 }
 
-static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
-{
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		if (unlikely(!sbinfo->free_inodes)) {
-			spin_unlock(&sbinfo->stat_lock);
-			return 0;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
-	return 1;
-}
-
-static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
-{
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-}
-
-
 static kmem_cache_t *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
 	struct hugetlbfs_inode_info *p;
 
-	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
-		return NULL;
 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
-	if (unlikely(!p)) {
-		hugetlbfs_inc_free_inodes(sbinfo);
+	if (!p)
 		return NULL;
-	}
 	return &p->vfs_inode;
 }
 
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&ei->vfs_inode);
+}
+
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
-	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
@@ -588,16 +565,6 @@ static struct address_space_operations hugetlbfs_aops = {
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
 };
 
-
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
-}
-
 struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= simple_sync_file,
@@ -625,7 +592,6 @@ static struct super_operations hugetlbfs_ops = {
 	.alloc_inode    = hugetlbfs_alloc_inode,
 	.destroy_inode  = hugetlbfs_destroy_inode,
 	.statfs		= hugetlbfs_statfs,
-	.delete_inode	= hugetlbfs_delete_inode,
 	.drop_inode	= hugetlbfs_drop_inode,
 	.put_super	= hugetlbfs_put_super,
 };
diff --git a/trunk/fs/jfs/jfs_metapage.c b/trunk/fs/jfs/jfs_metapage.c
index 8a53981f9f27..26091a5f88d4 100644
--- a/trunk/fs/jfs/jfs_metapage.c
+++ b/trunk/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
+#define mp_anchor(page) ((struct meta_anchor *)page->private)
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		set_page_private(page, (unsigned long)a);
+		page->private = (unsigned long)a;
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 	a->mp[index] = NULL;
 	if (--a->mp_count == 0) {
 		kfree(a);
-		set_page_private(page, 0);
+		page->private = 0;
 		ClearPagePrivate(page);
 		kunmap(page);
 	}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
+	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		set_page_private(page, (unsigned long)mp);
+		page->private = (unsigned long)mp;
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	set_page_private(page, 0);
+	page->private = 0;
 	ClearPagePrivate(page);
 	kunmap(page);
 }
diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c
index 3e1239e4b303..d84eecacbeaf 100644
--- a/trunk/fs/proc/array.c
+++ b/trunk/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		jiffies_to_clock_t(it_real_value),
 		start_time,
 		vsize,
-		mm ? get_mm_rss(mm) : 0,
+		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
 	        rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c
index d2fa42006d8f..c7ef3e48e35b 100644
--- a/trunk/fs/proc/task_mmu.c
+++ b/trunk/fs/proc/task_mmu.c
@@ -14,41 +14,22 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
-	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
-
-	/*
-	 * Note: to minimize their overhead, mm maintains hiwater_vm and
-	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
-	 * collector of these hiwater stats must therefore get total_vm
-	 * and rss too, which will usually be the higher.  Barriers? not
-	 * worth the effort, such snapshots can always be inconsistent.
-	 */
-	hiwater_vm = total_vm = mm->total_vm;
-	if (hiwater_vm < mm->hiwater_vm)
-		hiwater_vm = mm->hiwater_vm;
-	hiwater_rss = total_rss = get_mm_rss(mm);
-	if (hiwater_rss < mm->hiwater_rss)
-		hiwater_rss = mm->hiwater_rss;
 
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	buffer += sprintf(buffer,
-		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
-		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n",
-		hiwater_vm << (PAGE_SHIFT-10),
-		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		hiwater_rss << (PAGE_SHIFT-10),
-		total_rss << (PAGE_SHIFT-10),
+		get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -63,11 +44,13 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	int rss = get_mm_counter(mm, rss);
+
+	*shared = rss - get_mm_counter(mm, anon_rss);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = rss;
 	return mm->total_vm;
 }
 
@@ -203,14 +186,13 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				struct mem_size_stats *mss)
 {
 	pte_t *pte, ptent;
-	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map(pmd, addr);
 	do {
 		ptent = *pte;
-		if (!pte_present(ptent))
+		if (pte_none(ptent) || !pte_present(ptent))
 			continue;
 
 		mss->resident += PAGE_SIZE;
@@ -231,8 +213,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				mss->private_clean += PAGE_SIZE;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
+	pte_unmap(pte - 1);
+	cond_resched_lock(&vma->vm_mm->page_table_lock);
 }
 
 static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -286,11 +268,17 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
+	struct mm_struct *mm = vma->vm_mm;
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
-	if (vma->vm_mm)
+
+	if (mm) {
+		spin_lock(&mm->page_table_lock);
 		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+		spin_unlock(&mm->page_table_lock);
+	}
+
 	return show_map_internal(m, v, &mss);
 }
 
@@ -419,6 +407,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
+	spin_lock(&mm->page_table_lock);
  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -433,8 +422,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
-		cond_resched();
 	}
+	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
@@ -480,7 +469,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_printf(m, " interleave={");
 		first = 1;
 		for_each_node(n) {
-			if (node_isset(n, pol->v.nodes)) {
+			if (test_bit(n, pol->v.nodes)) {
 				if (!first)
 					seq_putc(m,',');
 				else
diff --git a/trunk/fs/xfs/linux-2.6/xfs_buf.c b/trunk/fs/xfs/linux-2.6/xfs_buf.c
index 4cd46abe8434..ba4767c04adf 100644
--- a/trunk/fs/xfs/linux-2.6/xfs_buf.c
+++ b/trunk/fs/xfs/linux-2.6/xfs_buf.c
@@ -181,9 +181,8 @@ set_page_region(
 	size_t		offset,
 	size_t		length)
 {
-	set_page_private(page,
-		page_private(page) | page_region_mask(offset, length));
-	if (page_private(page) == ~0UL)
+	page->private |= page_region_mask(offset, length);
+	if (page->private == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -195,7 +194,7 @@ test_page_region(
 {
 	unsigned long	mask = page_region_mask(offset, length);
 
-	return (mask && (page_private(page) & mask) == mask);
+	return (mask && (page->private & mask) == mask);
 }
 
 /*
diff --git a/trunk/include/asm-alpha/barrier.h b/trunk/include/asm-alpha/barrier.h
index 681ff581afa5..229c83fe77cb 100644
--- a/trunk/include/asm-alpha/barrier.h
+++ b/trunk/include/asm-alpha/barrier.h
@@ -1,8 +1,6 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
-#include <asm/compiler.h>
-
 #define mb() \
 __asm__ __volatile__("mb": : :"memory")
 
diff --git a/trunk/include/asm-alpha/rwsem.h b/trunk/include/asm-alpha/rwsem.h
index fafdd4f7010a..8e058a67c9a4 100644
--- a/trunk/include/asm-alpha/rwsem.h
+++ b/trunk/include/asm-alpha/rwsem.h
@@ -262,10 +262,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
diff --git a/trunk/include/asm-arm/tlb.h b/trunk/include/asm-arm/tlb.h
index f49bfb78c221..9bb325c54645 100644
--- a/trunk/include/asm-arm/tlb.h
+++ b/trunk/include/asm-arm/tlb.h
@@ -27,7 +27,11 @@
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
+	unsigned int		freed;
 	unsigned int		fullmm;
+
+	unsigned int		flushes;
+	unsigned int		avoided_flushes;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -35,9 +39,11 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	int cpu = smp_processor_id();
+	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
 
 	tlb->mm = mm;
+	tlb->freed = 0;
 	tlb->fullmm = full_mm_flush;
 
 	return tlb;
@@ -46,13 +52,24 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
+	struct mm_struct *mm = tlb->mm;
+	unsigned long freed = tlb->freed;
+	int rss = get_mm_counter(mm, rss);
+
+	if (rss < freed)
+		freed = rss;
+	add_mm_counter(mm, rss, -freed);
+
 	if (tlb->fullmm)
-		flush_tlb_mm(tlb->mm);
+		flush_tlb_mm(mm);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+}
 
-	put_cpu_var(mmu_gathers);
+static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
+{
+	return tlb->fullmm;
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
diff --git a/trunk/include/asm-arm26/tlb.h b/trunk/include/asm-arm26/tlb.h
index 08ddd85b8d35..1316352a58f3 100644
--- a/trunk/include/asm-arm26/tlb.h
+++ b/trunk/include/asm-arm26/tlb.h
@@ -10,20 +10,24 @@
  */
 struct mmu_gather {
         struct mm_struct        *mm;
-        unsigned int            need_flush;
-        unsigned int            fullmm;
+        unsigned int            freed;
+	unsigned int            fullmm;
+
+        unsigned int            flushes;
+        unsigned int            avoided_flushes;
 };
 
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+extern struct mmu_gather mmu_gathers[NR_CPUS];
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+        int cpu = smp_processor_id();
+        struct mmu_gather *tlb = &mmu_gathers[cpu];
 
         tlb->mm = mm;
-        tlb->need_flush = 0;
-        tlb->fullmm = full_mm_flush;
+        tlb->freed = 0;
+	tlb->fullmm = full_mm_flush;
 
         return tlb;
 }
@@ -31,13 +35,30 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        if (tlb->need_flush)
-                flush_tlb_mm(tlb->mm);
+        struct mm_struct *mm = tlb->mm;
+        unsigned long freed = tlb->freed;
+        int rss = get_mm_counter(mm, rss);
+
+        if (rss < freed)
+                freed = rss;
+        add_mm_counter(mm, rss, -freed);
+
+        if (freed) {
+                flush_tlb_mm(mm);
+                tlb->flushes++;
+        } else {
+                tlb->avoided_flushes++;
+        }
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
+}
+
 
-        put_cpu_var(mmu_gathers);
+static inline unsigned int
+tlb_is_full_mm(struct mmu_gather *tlb)
+{
+     return tlb->fullmm;
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)  do { } while (0)
@@ -50,13 +71,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
         } while (0)
 #define tlb_end_vma(tlb,vma)                    do { } while (0)
 
-static inline void
-tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
-        tlb->need_flush = 1;
-        free_page_and_swap_cache(page);
-}
-
+#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
 #define pte_free_tlb(tlb,ptep)          pte_free(ptep)
 #define pmd_free_tlb(tlb,pmdp)          pmd_free(pmdp)
 
diff --git a/trunk/include/asm-generic/4level-fixup.h b/trunk/include/asm-generic/4level-fixup.h
index 68c6fea994d9..c20ec257ecc0 100644
--- a/trunk/include/asm-generic/4level-fixup.h
+++ b/trunk/include/asm-generic/4level-fixup.h
@@ -10,9 +10,14 @@
 
 #define pud_t				pgd_t
 
-#define pmd_alloc(mm, pud, address) \
-	((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
- 		NULL: pmd_offset(pud, address))
+#define pmd_alloc(mm, pud, address)			\
+({	pmd_t *ret;					\
+	if (pgd_none(*pud))				\
+ 		ret = __pmd_alloc(mm, pud, address);	\
+ 	else						\
+		ret = pmd_offset(pud, address);		\
+ 	ret;						\
+})
 
 #define pud_alloc(mm, pgd, address)	(pgd)
 #define pud_offset(pgd, start)		(pgd)
diff --git a/trunk/include/asm-generic/pgtable.h b/trunk/include/asm-generic/pgtable.h
index 7dca30a26c53..ff28c8b31f58 100644
--- a/trunk/include/asm-generic/pgtable.h
+++ b/trunk/include/asm-generic/pgtable.h
@@ -8,7 +8,7 @@
  *  - update the page tables
  *  - inform the TLB about the new one
  *
- * We hold the mm semaphore for reading, and the pte lock.
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
  *
  * Note: the old pte is known to not be writable, so we don't need to
  * worry about dirty bits etc getting lost.
diff --git a/trunk/include/asm-generic/tlb.h b/trunk/include/asm-generic/tlb.h
index cdd4145243cd..7d0298347ee7 100644
--- a/trunk/include/asm-generic/tlb.h
+++ b/trunk/include/asm-generic/tlb.h
@@ -35,13 +35,16 @@
 #endif
 
 /* struct mmu_gather is an opaque type used by the mm code for passing around
- * any data needed by arch specific code for tlb_remove_page.
+ * any data needed by arch specific code for tlb_remove_page.  This structure
+ * can be per-CPU or per-MM as the page table lock is held for the duration of
+ * TLB shootdown.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
 	unsigned int		nr;	/* set to ~0U means fast mode */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
+	unsigned long		freed;
 	struct page *		pages[FREE_PTE_NR];
 };
 
@@ -54,7 +57,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
 
 	tlb->mm = mm;
 
@@ -62,6 +65,7 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
 
 	tlb->fullmm = full_mm_flush;
+	tlb->freed = 0;
 
 	return tlb;
 }
@@ -81,17 +85,28 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 /* tlb_finish_mmu
  *	Called at the end of the shootdown operation to free up any resources
- *	that were required.
+ *	that were required.  The page table lock is still held at this point.
  */
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
+	int freed = tlb->freed;
+	struct mm_struct *mm = tlb->mm;
+	int rss = get_mm_counter(mm, rss);
+
+	if (rss < freed)
+		freed = rss;
+	add_mm_counter(mm, rss, -freed);
 	tlb_flush_mmu(tlb, start, end);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+}
 
-	put_cpu_var(mmu_gathers);
+static inline unsigned int
+tlb_is_full_mm(struct mmu_gather *tlb)
+{
+	return tlb->fullmm;
 }
 
 /* tlb_remove_page
diff --git a/trunk/include/asm-i386/mmzone.h b/trunk/include/asm-i386/mmzone.h
index 620a90641ea8..348fe3a4879d 100644
--- a/trunk/include/asm-i386/mmzone.h
+++ b/trunk/include/asm-i386/mmzone.h
@@ -88,6 +88,12 @@ static inline int pfn_to_nid(unsigned long pfn)
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
 })
 
+#define local_mapnr(kvaddr)						\
+({									\
+	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
+	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
+})
+
 /* XXX: FIXME -- wli */
 #define kern_addr_valid(kaddr)	(0)
 
diff --git a/trunk/include/asm-i386/pgtable.h b/trunk/include/asm-i386/pgtable.h
index 0e3ec809352d..d101ac414f07 100644
--- a/trunk/include/asm-i386/pgtable.h
+++ b/trunk/include/asm-i386/pgtable.h
@@ -203,8 +203,7 @@ extern unsigned long pg0[];
 #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
-#define pmd_none(x)	(!(unsigned long)pmd_val(x))
+#define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
diff --git a/trunk/include/asm-i386/rwsem.h b/trunk/include/asm-i386/rwsem.h
index be4ab859238e..7625a675852f 100644
--- a/trunk/include/asm-i386/rwsem.h
+++ b/trunk/include/asm-i386/rwsem.h
@@ -284,10 +284,5 @@ LOCK_PREFIX	"xadd %0,(%2)"
 	return tmp+delta;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _I386_RWSEM_H */
diff --git a/trunk/include/asm-ia64/rwsem.h b/trunk/include/asm-ia64/rwsem.h
index 1327c91ea39c..e18b5ab0cb75 100644
--- a/trunk/include/asm-ia64/rwsem.h
+++ b/trunk/include/asm-ia64/rwsem.h
@@ -186,9 +186,4 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* _ASM_IA64_RWSEM_H */
diff --git a/trunk/include/asm-ia64/tlb.h b/trunk/include/asm-ia64/tlb.h
index 834370b9dea1..3a9a6d1be75c 100644
--- a/trunk/include/asm-ia64/tlb.h
+++ b/trunk/include/asm-ia64/tlb.h
@@ -60,6 +60,7 @@ struct mmu_gather {
 	unsigned int		nr;		/* == ~0U => fast mode */
 	unsigned char		fullmm;		/* non-zero means full mm flush */
 	unsigned char		need_flush;	/* really unmapped some PTEs? */
+	unsigned long		freed;		/* number of pages freed */
 	unsigned long		start_addr;
 	unsigned long		end_addr;
 	struct page 		*pages[FREE_PTE_NR];
@@ -128,7 +129,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
 static inline struct mmu_gather *
 tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 	/*
@@ -146,17 +147,25 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 	 */
 	tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
 	tlb->fullmm = full_mm_flush;
+	tlb->freed = 0;
 	tlb->start_addr = ~0UL;
 	return tlb;
 }
 
 /*
  * Called at the end of the shootdown operation to free up any resources that were
- * collected.
+ * collected.  The page table lock is still held at this point.
  */
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
+	unsigned long freed = tlb->freed;
+	struct mm_struct *mm = tlb->mm;
+	unsigned long rss = get_mm_counter(mm, rss);
+
+	if (rss < freed)
+		freed = rss;
+	add_mm_counter(mm, rss, -freed);
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
@@ -165,8 +174,12 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+}
 
-	put_cpu_var(mmu_gathers);
+static inline unsigned int
+tlb_is_full_mm(struct mmu_gather *tlb)
+{
+     return tlb->fullmm;
 }
 
 /*
diff --git a/trunk/include/asm-m32r/mmzone.h b/trunk/include/asm-m32r/mmzone.h
index adc7970a77ec..d58878ec899e 100644
--- a/trunk/include/asm-m32r/mmzone.h
+++ b/trunk/include/asm-m32r/mmzone.h
@@ -21,6 +21,12 @@ extern struct pglist_data *node_data[];
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;	\
 })
 
+#define local_mapnr(kvaddr)						\
+({									\
+	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
+	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
+})
+
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = pfn;					\
diff --git a/trunk/include/asm-parisc/cacheflush.h b/trunk/include/asm-parisc/cacheflush.h
index 1bc3c83ee74b..aa592d8c0e39 100644
--- a/trunk/include/asm-parisc/cacheflush.h
+++ b/trunk/include/asm-parisc/cacheflush.h
@@ -100,34 +100,30 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 
 /* Simple function to work out if we have an existing address translation
  * for a user space vma. */
-static inline int translation_exists(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long pfn)
+static inline pte_t *__translation_exists(struct mm_struct *mm,
+					  unsigned long addr)
 {
-	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+	pgd_t *pgd = pgd_offset(mm, addr);
 	pmd_t *pmd;
-	pte_t pte;
+	pte_t *pte;
 
 	if(pgd_none(*pgd))
-		return 0;
+		return NULL;
 
 	pmd = pmd_offset(pgd, addr);
 	if(pmd_none(*pmd) || pmd_bad(*pmd))
-		return 0;
+		return NULL;
 
-	/* We cannot take the pte lock here: flush_cache_page is usually
-	 * called with pte lock already held.  Whereas flush_dcache_page
-	 * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
-	 * the vma itself is secure, but the pte might come or go racily.
-	 */
-	pte = *pte_offset_map(pmd, addr);
-	/* But pte_unmap() does nothing on this architecture */
+	pte = pte_offset_map(pmd, addr);
 
-	/* Filter out coincidental file entries and swap entries */
-	if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
-		return 0;
-
-	return pte_pfn(pte) == pfn;
+	/* The PA flush mappings show up as pte_none, but they're
+	 * valid none the less */
+	if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
+		return NULL;
+	return pte;
 }
+#define	translation_exists(vma, addr)	__translation_exists((vma)->vm_mm, addr)
+
 
 /* Private function to flush a page from the cache of a non-current
  * process.  cr25 contains the Page Directory of the current user
@@ -179,8 +175,9 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
 {
 	BUG_ON(!vma->vm_mm->context);
 
-	if (likely(translation_exists(vma, vmaddr, pfn)))
+	if(likely(translation_exists(vma, vmaddr)))
 		__flush_cache_page(vma, vmaddr);
 
 }
 #endif
+
diff --git a/trunk/include/asm-parisc/mmzone.h b/trunk/include/asm-parisc/mmzone.h
index ae039f4fd711..595d3dce120a 100644
--- a/trunk/include/asm-parisc/mmzone.h
+++ b/trunk/include/asm-parisc/mmzone.h
@@ -27,6 +27,12 @@ extern struct node_map_data node_data[];
 })
 #define node_localnr(pfn, nid)		((pfn) - node_start_pfn(nid))
 
+#define local_mapnr(kvaddr)						\
+({									\
+	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
+	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
+})
+
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = (pfn);					\
diff --git a/trunk/include/asm-parisc/tlbflush.h b/trunk/include/asm-parisc/tlbflush.h
index e97aa8d1eff5..84af4ab1fe51 100644
--- a/trunk/include/asm-parisc/tlbflush.h
+++ b/trunk/include/asm-parisc/tlbflush.h
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	if (npages >= 512)  /* 2MB of space: arbitrary, should be tuned */
 		flush_tlb_all();
 	else {
-		preempt_disable();
+
 		mtsp(vma->vm_mm->context,1);
 		purge_tlb_start();
 		if (split_tlb) {
@@ -102,7 +102,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 				pdtlb(start);
 				start += PAGE_SIZE;
 			}
-		preempt_enable();
 		}
 		purge_tlb_end();
 	}
diff --git a/trunk/include/asm-ppc/rwsem.h b/trunk/include/asm-ppc/rwsem.h
index 3501ea72f88c..3e738f483c11 100644
--- a/trunk/include/asm-ppc/rwsem.h
+++ b/trunk/include/asm-ppc/rwsem.h
@@ -168,10 +168,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/trunk/include/asm-ppc64/mmzone.h b/trunk/include/asm-ppc64/mmzone.h
index 80a708e7093a..ed473f4b0152 100644
--- a/trunk/include/asm-ppc64/mmzone.h
+++ b/trunk/include/asm-ppc64/mmzone.h
@@ -67,6 +67,9 @@ static inline int pa_to_nid(unsigned long pa)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
+#define local_mapnr(kvaddr) \
+	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) 
+
 #ifdef CONFIG_DISCONTIGMEM
 
 /*
diff --git a/trunk/include/asm-ppc64/pgtable.h b/trunk/include/asm-ppc64/pgtable.h
index 2eb1778a3a15..c83679c9d2b0 100644
--- a/trunk/include/asm-ppc64/pgtable.h
+++ b/trunk/include/asm-ppc64/pgtable.h
@@ -478,12 +478,10 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
-#define pte_ERROR(e) \
-	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pud_ERROR(e) \
-	printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
diff --git a/trunk/include/asm-ppc64/rwsem.h b/trunk/include/asm-ppc64/rwsem.h
index 7a647fae3765..bd5c2f093575 100644
--- a/trunk/include/asm-ppc64/rwsem.h
+++ b/trunk/include/asm-ppc64/rwsem.h
@@ -163,10 +163,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/trunk/include/asm-s390/rwsem.h b/trunk/include/asm-s390/rwsem.h
index 0422a085dd56..8c0cebbfc034 100644
--- a/trunk/include/asm-s390/rwsem.h
+++ b/trunk/include/asm-s390/rwsem.h
@@ -351,10 +351,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return new;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
diff --git a/trunk/include/asm-sh/rwsem.h b/trunk/include/asm-sh/rwsem.h
index 0262d3d1e5e0..1be4337f5259 100644
--- a/trunk/include/asm-sh/rwsem.h
+++ b/trunk/include/asm-sh/rwsem.h
@@ -166,10 +166,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
diff --git a/trunk/include/asm-sparc64/rwsem.h b/trunk/include/asm-sparc64/rwsem.h
index cef5e8270421..4568ee4022df 100644
--- a/trunk/include/asm-sparc64/rwsem.h
+++ b/trunk/include/asm-sparc64/rwsem.h
@@ -56,11 +56,6 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
 	atomic_add(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
diff --git a/trunk/include/asm-sparc64/tlb.h b/trunk/include/asm-sparc64/tlb.h
index 66138d959df5..9baf57db01d2 100644
--- a/trunk/include/asm-sparc64/tlb.h
+++ b/trunk/include/asm-sparc64/tlb.h
@@ -25,8 +25,9 @@ struct mmu_gather {
 	struct mm_struct *mm;
 	unsigned int pages_nr;
 	unsigned int need_flush;
-	unsigned int fullmm;
+	unsigned int tlb_frozen;
 	unsigned int tlb_nr;
+	unsigned long freed;
 	unsigned long vaddrs[TLB_BATCH_NR];
 	struct page *pages[FREE_PTE_NR];
 };
@@ -43,13 +44,14 @@ extern void flush_tlb_pending(void);
 
 static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *mp = &get_cpu_var(mmu_gathers);
+	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
 
 	BUG_ON(mp->tlb_nr);
 
 	mp->mm = mm;
 	mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
-	mp->fullmm = full_mm_flush;
+	mp->tlb_frozen = full_mm_flush;
+	mp->freed = 0;
 
 	return mp;
 }
@@ -76,19 +78,30 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm);
 
 static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end)
 {
+	unsigned long freed = mp->freed;
+	struct mm_struct *mm = mp->mm;
+	unsigned long rss = get_mm_counter(mm, rss);
+
+	if (rss < freed)
+		freed = rss;
+	add_mm_counter(mm, rss, -freed);
+
 	tlb_flush_mmu(mp);
 
-	if (mp->fullmm) {
-		if (CTX_VALID(mp->mm->context))
-			do_flush_tlb_mm(mp->mm);
-		mp->fullmm = 0;
+	if (mp->tlb_frozen) {
+		if (CTX_VALID(mm->context))
+			do_flush_tlb_mm(mm);
+		mp->tlb_frozen = 0;
 	} else
 		flush_tlb_pending();
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+}
 
-	put_cpu_var(mmu_gathers);
+static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)
+{
+	return mp->tlb_frozen;
 }
 
 static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
diff --git a/trunk/include/asm-um/pgtable.h b/trunk/include/asm-um/pgtable.h
index ac64eb955868..616d02b57ea9 100644
--- a/trunk/include/asm-um/pgtable.h
+++ b/trunk/include/asm-um/pgtable.h
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
 
 #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
 
-#define pmd_none(x)	(!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
+#define pmd_none(x)	(!(pmd_val(x) & ~_PAGE_NEWPAGE))
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
diff --git a/trunk/include/asm-x86_64/rwsem.h b/trunk/include/asm-x86_64/rwsem.h
index 46077e9c1910..c002175b6e82 100644
--- a/trunk/include/asm-x86_64/rwsem.h
+++ b/trunk/include/asm-x86_64/rwsem.h
@@ -274,10 +274,5 @@ LOCK_PREFIX	"xaddl %0,(%2)"
 	return tmp+delta;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _X8664_RWSEM_H */
diff --git a/trunk/include/linux/buffer_head.h b/trunk/include/linux/buffer_head.h
index c937d6e65502..88af42f5e04a 100644
--- a/trunk/include/linux/buffer_head.h
+++ b/trunk/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
-		BUG_ON(!PagePrivate(page));			\
-		((struct buffer_head *)page_private(page));	\
+		BUG_ON(!PagePrivate(page));		\
+		((struct buffer_head *)(page)->private);	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	set_page_private(page, (unsigned long)head);
+	page->private = (unsigned long)head;
 }
 
 static inline void get_bh(struct buffer_head *bh)
diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h
index 0cea162b08c0..d664330d900e 100644
--- a/trunk/include/linux/hugetlb.h
+++ b/trunk/include/linux/hugetlb.h
@@ -16,6 +16,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
+void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
@@ -86,6 +87,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
+#define zap_hugepage_range(vma, start, len)	BUG()
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
diff --git a/trunk/include/linux/memory.h b/trunk/include/linux/memory.h
deleted file mode 100644
index 0def328ab5cf..000000000000
--- a/trunk/include/linux/memory.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * include/linux/memory.h - generic memory definition
- *
- * This is mainly for topological representation. We define the
- * basic "struct memory_block" here, which can be embedded in per-arch
- * definitions or NUMA information.
- *
- * Basic handling of the devices is done in drivers/base/memory.c
- * and system devices are handled in drivers/base/sys.c.
- *
- * Memory block are exported via sysfs in the class/memory/devices/
- * directory.
- *
- */
-#ifndef _LINUX_MEMORY_H_
-#define _LINUX_MEMORY_H_
-
-#include <linux/sysdev.h>
-#include <linux/node.h>
-#include <linux/compiler.h>
-
-#include <asm/semaphore.h>
-
-struct memory_block {
-	unsigned long phys_index;
-	unsigned long state;
-	/*
-	 * This serializes all state change requests.  It isn't
-	 * held during creation because the control files are
-	 * created long after the critical areas during
-	 * initialization.
-	 */
-	struct semaphore state_sem;
-	int phys_device;		/* to which fru does this belong? */
-	void *hw;			/* optional pointer to fw/hw data */
-	int (*phys_callback)(struct memory_block *);
-	struct sys_device sysdev;
-};
-
-/* These states are exposed to userspace as text strings in sysfs */
-#define	MEM_ONLINE		(1<<0) /* exposed to userspace */
-#define	MEM_GOING_OFFLINE	(1<<1) /* exposed to userspace */
-#define	MEM_OFFLINE		(1<<2) /* exposed to userspace */
-
-/*
- * All of these states are currently kernel-internal for notifying
- * kernel components and architectures.
- *
- * For MEM_MAPPING_INVALID, all notifier chains with priority >0
- * are called before pfn_to_page() becomes invalid.  The priority=0
- * entry is reserved for the function that actually makes
- * pfn_to_page() stop working.  Any notifiers that want to be called
- * after that should have priority <0.
- */
-#define	MEM_MAPPING_INVALID	(1<<3)
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline int memory_dev_init(void)
-{
-	return 0;
-}
-static inline int register_memory_notifier(struct notifier_block *nb)
-{
-	return 0;
-}
-static inline void unregister_memory_notifier(struct notifier_block *nb)
-{
-}
-#else
-extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
-extern int register_new_memory(struct mem_section *);
-extern int unregister_memory_section(struct mem_section *);
-extern int memory_dev_init(void);
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
-
-#define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
-
-extern int invalidate_phys_mapping(unsigned long, unsigned long);
-struct notifier_block;
-
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
-
-extern struct sysdev_class memory_sysdev_class;
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#define hotplug_memory_notifier(fn, pri) {			\
-	static struct notifier_block fn##_mem_nb =		\
-		{ .notifier_call = fn, .priority = pri };	\
-	register_memory_notifier(&fn##_mem_nb);			\
-}
-
-#endif /* _LINUX_MEMORY_H_ */
diff --git a/trunk/include/linux/memory_hotplug.h b/trunk/include/linux/memory_hotplug.h
deleted file mode 100644
index 01f03bc06eff..000000000000
--- a/trunk/include/linux/memory_hotplug.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef __LINUX_MEMORY_HOTPLUG_H
-#define __LINUX_MEMORY_HOTPLUG_H
-
-#include <linux/mmzone.h>
-#include <linux/spinlock.h>
-#include <linux/mmzone.h>
-#include <linux/notifier.h>
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-/*
- * pgdat resizing functions
- */
-static inline
-void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
-{
-	spin_lock_irqsave(&pgdat->node_size_lock, *flags);
-}
-static inline
-void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
-{
-	spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
-}
-static inline
-void pgdat_resize_init(struct pglist_data *pgdat)
-{
-	spin_lock_init(&pgdat->node_size_lock);
-}
-/*
- * Zone resizing functions
- */
-static inline unsigned zone_span_seqbegin(struct zone *zone)
-{
-	return read_seqbegin(&zone->span_seqlock);
-}
-static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
-{
-	return read_seqretry(&zone->span_seqlock, iv);
-}
-static inline void zone_span_writelock(struct zone *zone)
-{
-	write_seqlock(&zone->span_seqlock);
-}
-static inline void zone_span_writeunlock(struct zone *zone)
-{
-	write_sequnlock(&zone->span_seqlock);
-}
-static inline void zone_seqlock_init(struct zone *zone)
-{
-	seqlock_init(&zone->span_seqlock);
-}
-extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
-extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
-extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
-/* need some defines for these for archs that don't support it */
-extern void online_page(struct page *page);
-/* VM interface that may be used by firmware interface */
-extern int add_memory(u64 start, u64 size);
-extern int remove_memory(u64 start, u64 size);
-extern int online_pages(unsigned long, unsigned long);
-
-/* reasonably generic interface to expand the physical pages in a zone  */
-extern int __add_pages(struct zone *zone, unsigned long start_pfn,
-	unsigned long nr_pages);
-#else /* ! CONFIG_MEMORY_HOTPLUG */
-/*
- * Stub functions for when hotplug is off
- */
-static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
-static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
-static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
-
-static inline unsigned zone_span_seqbegin(struct zone *zone)
-{
-	return 0;
-}
-static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
-{
-	return 0;
-}
-static inline void zone_span_writelock(struct zone *zone) {}
-static inline void zone_span_writeunlock(struct zone *zone) {}
-static inline void zone_seqlock_init(struct zone *zone) {}
-
-static inline int mhp_notimplemented(const char *func)
-{
-	printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
-	dump_stack();
-	return -ENOSYS;
-}
-
-static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
-	unsigned long nr_pages)
-{
-	return mhp_notimplemented(__FUNCTION__);
-}
-#endif /* ! CONFIG_MEMORY_HOTPLUG */
-static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
-	unsigned long nr_pages)
-{
-	printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
-	dump_stack();
-	return -ENOSYS;
-}
-#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/trunk/include/linux/mempolicy.h b/trunk/include/linux/mempolicy.h
index 7af8cb836e78..58385ee1c0ac 100644
--- a/trunk/include/linux/mempolicy.h
+++ b/trunk/include/linux/mempolicy.h
@@ -27,10 +27,10 @@
 
 #include <linux/config.h>
 #include <linux/mmzone.h>
+#include <linux/bitmap.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
-#include <linux/nodemask.h>
 
 struct vm_area_struct;
 
@@ -47,7 +47,8 @@ struct vm_area_struct;
  * Locking policy for interlave:
  * In process context there is no locking because only the process accesses
  * its own state. All vma manipulation is somewhat protected by a down_read on
- * mmap_sem.
+ * mmap_sem. For allocating in the interleave policy the page_table_lock
+ * must be also aquired to protect il_next.
  *
  * Freeing policy:
  * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
@@ -62,7 +63,7 @@ struct mempolicy {
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave */
+		DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */
 		/* undefined for default */
 	} v;
 };
diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h
index 5c1fb0a2e806..e1649578fb0c 100644
--- a/trunk/include/linux/mm.h
+++ b/trunk/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Pages managed in a special way */
+#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
@@ -226,18 +226,13 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	union {
-		unsigned long private;	/* Mapping-private opaque data:
+	unsigned long private;		/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-		spinlock_t ptl;
-#endif
-	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -265,9 +260,6 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
-#define page_private(page)		((page)->u.private)
-#define set_page_private(page, v)	((page)->u.private = (v))
-
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -319,17 +311,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *page)
+static inline int page_count(struct page *p)
 {
-	if (PageCompound(page))
-		page = (struct page *)page_private(page);
-	return atomic_read(&page->_count) + 1;
+	if (PageCompound(p))
+		p = (struct page *)p->private;
+	return atomic_read(&(p)->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page_private(page);
+		page = (struct page *)page->private;
 	atomic_inc(&page->_count);
 }
 
@@ -346,7 +338,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-	if (put_page_testzero(page))
+	if (!PageReserved(page) && put_page_testzero(page))
 		__page_cache_release(page);
 }
 
@@ -595,7 +587,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page_private(page);
+		return page->private;
 	return page->index;
 }
 
@@ -690,7 +682,7 @@ struct zap_details {
 
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
@@ -712,6 +704,10 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
+extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -727,7 +723,6 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
-void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
@@ -764,83 +759,38 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
-int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
-
+/*
+ * On a two-level or three-level page table, this ends up being trivial. Thus
+ * the inlining and the symmetry break with pte_alloc_map() that does all
+ * of this out-of-line.
+ */
 /*
  * The following ifdef needed to get the 4level-fixup.h header to work.
  * Remove it when 4level-fixup.h has been removed.
  */
-#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
+#ifdef CONFIG_MMU
+#ifndef __ARCH_HAS_4LEVEL_HACK 
 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
-		NULL: pud_offset(pgd, address);
+	if (pgd_none(*pgd))
+		return __pud_alloc(mm, pgd, address);
+	return pud_offset(pgd, address);
 }
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
-		NULL: pmd_offset(pud, address);
+	if (pud_none(*pud))
+		return __pmd_alloc(mm, pud, address);
+	return pmd_offset(pud, address);
 }
-#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
-
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-/*
- * We tuck a spinlock to guard each pagetable page into its struct page,
- * at page->private, with BUILD_BUG_ON to make sure that this will not
- * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
- * When freeing, reset page->mapping so free_pages_check won't complain.
- */
-#define __pte_lockptr(page)	&((page)->u.ptl)
-#define pte_lock_init(_page)	do {					\
-	spin_lock_init(__pte_lockptr(_page));				\
-} while (0)
-#define pte_lock_deinit(page)	((page)->mapping = NULL)
-#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
-#else
-/*
- * We use mm->page_table_lock to guard all pagetable pages of the mm.
- */
-#define pte_lock_init(page)	do {} while (0)
-#define pte_lock_deinit(page)	do {} while (0)
-#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
-#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
-
-#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
-({							\
-	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
-	pte_t *__pte = pte_offset_map(pmd, address);	\
-	*(ptlp) = __ptl;				\
-	spin_lock(__ptl);				\
-	__pte;						\
-})
-
-#define pte_unmap_unlock(pte, ptl)	do {		\
-	spin_unlock(ptl);				\
-	pte_unmap(pte);					\
-} while (0)
-
-#define pte_alloc_map(mm, pmd, address)			\
-	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
-		NULL: pte_offset_map(pmd, address))
-
-#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
-	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
-		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
-
-#define pte_alloc_kernel(pmd, address)			\
-	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
-		NULL: pte_offset_kernel(pmd, address))
+#endif
+#endif /* CONFIG_MMU */
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
-extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
@@ -884,7 +834,6 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
-extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
@@ -945,8 +894,7 @@ void handle_ra_miss(struct address_space *mapping,
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
@@ -969,28 +917,40 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
-struct page *vmalloc_to_page(void *addr);
-unsigned long vmalloc_to_pfn(void *addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
-			unsigned long pfn, unsigned long size, pgprot_t);
+extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
 
-struct page *follow_page(struct mm_struct *, unsigned long address,
-			unsigned int foll_flags);
-#define FOLL_WRITE	0x01	/* check pte is writable */
-#define FOLL_TOUCH	0x02	/* mark page accessed */
-#define FOLL_GET	0x04	/* do get_page on page */
-#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
+extern struct page * vmalloc_to_page(void *addr);
+extern unsigned long vmalloc_to_pfn(void *addr);
+extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
+		int write);
+extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
+int remap_pfn_range(struct vm_area_struct *, unsigned long,
+		unsigned long, unsigned long, pgprot_t);
 
 #ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
+void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
-static inline void vm_stat_account(struct mm_struct *mm,
+static inline void __vm_stat_account(struct mm_struct *mm,
 			unsigned long flags, struct file *file, long pages)
 {
 }
 #endif /* CONFIG_PROC_FS */
 
+static inline void vm_stat_account(struct vm_area_struct *vma)
+{
+	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+							vma_pages(vma));
+}
+
+static inline void vm_stat_unaccount(struct vm_area_struct *vma)
+{
+	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+							-vma_pages(vma));
+}
+
+/* update per process rss and vm hiwater data */
+extern void update_mem_hiwater(struct task_struct *tsk);
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h
index f5fa3082fd6a..7519eb4191e7 100644
--- a/trunk/include/linux/mmzone.h
+++ b/trunk/include/linux/mmzone.h
@@ -12,7 +12,6 @@
 #include <linux/threads.h>
 #include <linux/numa.h>
 #include <linux/init.h>
-#include <linux/seqlock.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -138,10 +137,6 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* see spanned/present_pages for more description */
-	seqlock_t		span_seqlock;
-#endif
 	struct free_area	free_area[MAX_ORDER];
 
 
@@ -225,16 +220,6 @@ struct zone {
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
-	/*
-	 * zone_start_pfn, spanned_pages and present_pages are all
-	 * protected by span_seqlock.  It is a seqlock because it has
-	 * to be read outside of zone->lock, and it is done in the main
-	 * allocator path.  But, it is written quite infrequently.
-	 *
-	 * The lock is declared along with zone->lock because it is
-	 * frequently read in proximity to zone->lock.  It's good to
-	 * give them a chance of being in the same cacheline.
-	 */
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
@@ -288,16 +273,6 @@ typedef struct pglist_data {
 	struct page *node_mem_map;
 #endif
 	struct bootmem_data *bdata;
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/*
-	 * Must be held any time you expect node_start_pfn, node_present_pages
-	 * or node_spanned_pages stay constant.  Holding this will also
-	 * guarantee that any pfn_valid() stays that way.
-	 *
-	 * Nests above zone->lock and zone->size_seqlock.
-	 */
-	spinlock_t node_size_lock;
-#endif
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
 	unsigned long node_spanned_pages; /* total size of physical page
@@ -318,8 +293,6 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
-#include <linux/memory_hotplug.h>
-
 extern struct pglist_data *pgdat_list;
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
@@ -536,7 +509,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 		return NULL;
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern int __section_nr(struct mem_section* ms);
 
 /*
  * We use the lower bits of the mem_map pointer to store
diff --git a/trunk/include/linux/rmap.h b/trunk/include/linux/rmap.h
index 35b30e6c8cf8..e80fb7ee6efd 100644
--- a/trunk/include/linux/rmap.h
+++ b/trunk/include/linux/rmap.h
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *);
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
-pte_t *page_check_address(struct page *, struct mm_struct *,
-				unsigned long, spinlock_t **);
+pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
+
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
diff --git a/trunk/include/linux/rwsem-spinlock.h b/trunk/include/linux/rwsem-spinlock.h
index f30f805080ae..b52a2af25f1f 100644
--- a/trunk/include/linux/rwsem-spinlock.h
+++ b/trunk/include/linux/rwsem-spinlock.h
@@ -61,10 +61,5 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem));
 extern void FASTCALL(__up_write(struct rw_semaphore *sem));
 extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->activity != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */
diff --git a/trunk/include/linux/scatterlist.h b/trunk/include/linux/scatterlist.h
index 7f717e95ae37..66ff545552f7 100644
--- a/trunk/include/linux/scatterlist.h
+++ b/trunk/include/linux/scatterlist.h
@@ -1,14 +1,23 @@
 #ifndef _LINUX_SCATTERLIST_H
 #define _LINUX_SCATTERLIST_H
 
-static inline void sg_init_one(struct scatterlist *sg,
-			       u8 *buf, unsigned int buflen)
-{
-	memset(sg, 0, sizeof(*sg));
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 
+static inline void sg_set_buf(struct scatterlist *sg, void *buf,
+			      unsigned int buflen)
+{
 	sg->page = virt_to_page(buf);
 	sg->offset = offset_in_page(buf);
 	sg->length = buflen;
 }
 
+static inline void sg_init_one(struct scatterlist *sg, void *buf,
+			       unsigned int buflen)
+{
+	memset(sg, 0, sizeof(*sg));
+	sg_set_buf(sg, buf, buflen);
+}
+
 #endif /* _LINUX_SCATTERLIST_H */
diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h
index 1c30bc308ef1..27519df0f987 100644
--- a/trunk/include/linux/sched.h
+++ b/trunk/include/linux/sched.h
@@ -249,36 +249,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#ifdef ATOMIC64_INIT
-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
-typedef atomic64_t mm_counter_t;
-#else /* !ATOMIC64_INIT */
-/*
- * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
- * that is, at 16TB if using 4kB page size.
- */
-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
-typedef atomic_t mm_counter_t;
-#endif /* !ATOMIC64_INIT */
-
-#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
@@ -286,20 +256,6 @@ typedef atomic_t mm_counter_t;
 #define dec_mm_counter(mm, member) (mm)->_##member--
 typedef unsigned long mm_counter_t;
 
-#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
-
-#define get_mm_rss(mm)					\
-	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm)	do {			\
-	unsigned long _rss = get_mm_rss(mm);		\
-	if ((mm)->hiwater_rss < _rss)			\
-		(mm)->hiwater_rss = _rss;		\
-} while (0)
-#define update_hiwater_vm(mm)	do {			\
-	if ((mm)->hiwater_vm < (mm)->total_vm)		\
-		(mm)->hiwater_vm = (mm)->total_vm;	\
-} while (0)
-
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -323,20 +279,15 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
-
-	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
-	unsigned long hiwater_vm;	/* High-water virtual memory usage */
-
-	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
-	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
+	unsigned long total_vm, locked_vm, shared_vm;
+	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
+
+	/* Special counters protected by the page_table_lock */
+	mm_counter_t _rss;
+	mm_counter_t _anon_rss;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
@@ -357,7 +308,11 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
+
 	struct kioctx		default_kioctx;
+
+	unsigned long hiwater_rss;	/* High-water RSS usage */
+	unsigned long hiwater_vm;	/* High-water virtual memory usage */
 };
 
 struct sighand_struct {
diff --git a/trunk/include/linux/vmalloc.h b/trunk/include/linux/vmalloc.h
index 1d5577b2b752..3701a0673d2c 100644
--- a/trunk/include/linux/vmalloc.h
+++ b/trunk/include/linux/vmalloc.h
@@ -32,14 +32,10 @@ struct vm_struct {
  *	Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
-extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
-				pgprot_t prot);
-extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
-				pgprot_t prot, int node);
+extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -52,8 +48,6 @@ extern void vunmap(void *addr);
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 					unsigned long start, unsigned long end);
-extern struct vm_struct *get_vm_area_node(unsigned long size,
-					unsigned long flags, int node);
 extern struct vm_struct *remove_vm_area(void *addr);
 extern struct vm_struct *__remove_vm_area(void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
diff --git a/trunk/ipc/shm.c b/trunk/ipc/shm.c
index b58c651d31ae..dca90489e3b0 100644
--- a/trunk/ipc/shm.c
+++ b/trunk/ipc/shm.c
@@ -233,11 +233,10 @@ static int newseg (key_t key, int shmflg, size_t size)
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
 	shp->shm_file = file;
 	file->f_dentry->d_inode->i_ino = shp->id;
-
-	/* Hugetlb ops would have already been assigned. */
-	if (!(shmflg & SHM_HUGETLB))
+	if (shmflg & SHM_HUGETLB)
+		set_file_hugepages(file);
+	else
 		file->f_op = &shm_file_operations;
-
 	shm_tot += numpages;
 	shm_unlock(shp);
 	return shp->id;
diff --git a/trunk/kernel/acct.c b/trunk/kernel/acct.c
index 2e3f4a47e7d0..b756f527497e 100644
--- a/trunk/kernel/acct.c
+++ b/trunk/kernel/acct.c
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
 		if (delta == 0)
 			return;
 		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
 }
diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c
index 79f52b85d6ed..3b25b182d2be 100644
--- a/trunk/kernel/exit.c
+++ b/trunk/kernel/exit.c
@@ -839,10 +839,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	if (tsk->mm) {
-		update_hiwater_rss(tsk->mm);
-		update_hiwater_vm(tsk->mm);
-	}
+	update_mem_hiwater(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
  		del_timer_sync(&tsk->signal->real_timer);
diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c
index 8a069612eac3..280bd44ac441 100644
--- a/trunk/kernel/fork.c
+++ b/trunk/kernel/fork.c
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 }
 
 #ifdef CONFIG_MMU
-static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 {
-	struct vm_area_struct *mpnt, *tmp, **pprev;
+	struct vm_area_struct * mpnt, *tmp, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
 	struct mempolicy *pol;
 
 	down_write(&oldmm->mmap_sem);
-	flush_cache_mm(oldmm);
-	down_write(&mm->mmap_sem);
-
+	flush_cache_mm(current->mm);
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
+	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
 	cpus_clear(mm->cpu_vm_mask);
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			long pages = vma_pages(mpnt);
 			mm->total_vm -= pages;
-			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
 			continue;
 		}
@@ -253,8 +253,12 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 
 		/*
-		 * Link in the new vma and copy the page table entries.
+		 * Link in the new vma and copy the page table entries:
+		 * link in first so that swapoff can see swap entries.
+		 * Note that, exceptionally, here the vma is inserted
+		 * without holding mm->mmap_sem.
 		 */
+		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 
@@ -263,7 +267,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, oldmm, tmp);
+		retval = copy_page_range(mm, current->mm, tmp);
+		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -272,9 +277,9 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			goto out;
 	}
 	retval = 0;
+
 out:
-	up_write(&mm->mmap_sem);
-	flush_tlb_mm(oldmm);
+	flush_tlb_mm(current->mm);
 	up_write(&oldmm->mmap_sem);
 	return retval;
 fail_nomem_policy:
@@ -318,8 +323,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, file_rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
@@ -496,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (retval)
 		goto free_pt;
 
-	mm->hiwater_rss = get_mm_rss(mm);
+	mm->hiwater_rss = get_mm_counter(mm,rss);
 	mm->hiwater_vm = mm->total_vm;
 
 good_mm:
diff --git a/trunk/kernel/futex.c b/trunk/kernel/futex.c
index 3b4d5ad44cc6..ca05fe6a70b2 100644
--- a/trunk/kernel/futex.c
+++ b/trunk/kernel/futex.c
@@ -205,13 +205,15 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
+	spin_lock(&current->mm->page_table_lock);
+	page = follow_page(mm, uaddr, 0);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		put_page(page);
+		spin_unlock(&current->mm->page_table_lock);
 		return 0;
 	}
+	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
diff --git a/trunk/kernel/kexec.c b/trunk/kernel/kexec.c
index 2c95848fbce8..36c5d9cd4cc1 100644
--- a/trunk/kernel/kexec.c
+++ b/trunk/kernel/kexec.c
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
-		set_page_private(pages, order);
+		pages->private = order;
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
 
-	order = page_private(page);
+	order = page->private;
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
diff --git a/trunk/kernel/power/swsusp.c b/trunk/kernel/power/swsusp.c
index 016504ccfccf..10bc5ec496d7 100644
--- a/trunk/kernel/power/swsusp.c
+++ b/trunk/kernel/power/swsusp.c
@@ -578,23 +578,15 @@ static int save_highmem_zone(struct zone *zone)
 			continue;
 		page = pfn_to_page(pfn);
 		/*
-		 * PageReserved results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations.
-		 *
-		 * rvmalloc should not cause this, because all implementations
-		 * appear to always be using vmalloc_32 on architectures with
-		 * highmem. This is a good thing, because we would like to save
-		 * rvmalloc pages.
-		 *
-		 * It appears to be triggered by pages which do not point to
-		 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
-		 * which sets PageReserved if the page does not point to valid
-		 * RAM.
-		 *
-		 * XXX: must remove usage of PageReserved!
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
 		 */
-		if (PageReserved(page))
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
 			continue;
+		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
@@ -680,9 +672,10 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
 		return 0;
 
 	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (pfn_is_nosave(pfn)) {
+	if (PageReserved(page) && pfn_is_nosave(pfn)) {
 		pr_debug("[nosave pfn 0x%lx]", pfn);
 		return 0;
 	}
diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c
index 4f26c544d02c..1e5cafdf4e27 100644
--- a/trunk/kernel/sched.c
+++ b/trunk/kernel/sched.c
@@ -2511,6 +2511,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
+	/* Update rss highwater mark */
+	update_mem_hiwater(p);
 }
 
 /*
diff --git a/trunk/kernel/timer.c b/trunk/kernel/timer.c
index 6a2e5f8dc725..3ba10fa35b60 100644
--- a/trunk/kernel/timer.c
+++ b/trunk/kernel/timer.c
@@ -752,15 +752,6 @@ static void second_overflow(void)
     else
 	time_adj += (time_adj >> 2) + (time_adj >> 5);
 #endif
-#if HZ == 250
-    /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
-     * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-     */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
-    else
-	time_adj += (time_adj >> 6) + (time_adj >> 7);
-#endif
 #if HZ == 1000
     /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig
index 1a4473fcb2ca..391ffc54d136 100644
--- a/trunk/mm/Kconfig
+++ b/trunk/mm/Kconfig
@@ -111,24 +111,3 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
-
-# eventually, we can have this option just 'select SPARSEMEM'
-config MEMORY_HOTPLUG
-	bool "Allow for memory hot-add"
-	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
-
-comment "Memory hotplug is currently incompatible with Software Suspend"
-	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
-
-# Heavily threaded applications may benefit from splitting the mm-wide
-# page_table_lock, so that faults on different parts of the user address
-# space can be handled with less contention: split it at this NR_CPUS.
-# Default to 4 for wider testing, though 8 might be more appropriate.
-# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
-# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
-#
-config SPLIT_PTLOCK_CPUS
-	int
-	default "4096" if ARM && !CPU_CACHE_VIPT
-	default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
-	default "4"
diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile
index 2fa6d2ca9f28..4cd69e3ce421 100644
--- a/trunk/mm/Makefile
+++ b/trunk/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/trunk/mm/bootmem.c b/trunk/mm/bootmem.c
index e8c567177dcf..a58699b6579e 100644
--- a/trunk/mm/bootmem.c
+++ b/trunk/mm/bootmem.c
@@ -305,7 +305,6 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
-				set_page_count(page + j, 0);
 			}
 			__free_pages(page, order);
 			i += BITS_PER_LONG;
diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c
index 768687f1d46b..1c31b2fd2ca5 100644
--- a/trunk/mm/filemap.c
+++ b/trunk/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *
  *  ->mmap_sem
  *    ->i_mmap_lock
- *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
+ *      ->page_table_lock	(various places, mainly in mmap.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
  *  ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
- *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
+ *    ->page_table_lock		(anon_vma_prepare and various)
  *
- *  ->page_table_lock or pte_lock
+ *  ->page_table_lock
  *    ->swap_lock		(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
 	 * in the ->sync_page() methods make essential use of the
 	 * page_mapping(), merely passing the page down to the backing
 	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
+	 * ignore it for all cases but swap, where only page->private is
 	 * of interest. When page_mapping() does go NULL, the entire
 	 * call stack gracefully ignores the page and returns.
 	 * -- wli
@@ -1520,7 +1520,7 @@ int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
 			page_cache_release(page);
 			return err;
 		}
-	} else if (vma->vm_flags & VM_NONLINEAR) {
+	} else {
 		/* No page was found just because we can't read it in now (being
 		 * here implies nonblock != 0), but the page may exist, so set
 		 * the PTE to fault it in later. */
@@ -1537,7 +1537,6 @@ int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
 
 	return 0;
 }
-EXPORT_SYMBOL(filemap_populate);
 
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
@@ -1556,6 +1555,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 	vma->vm_ops = &generic_file_vm_ops;
 	return 0;
 }
+EXPORT_SYMBOL(filemap_populate);
 
 /*
  * This is for filesystems which do not implement ->writepage.
diff --git a/trunk/mm/filemap_xip.c b/trunk/mm/filemap_xip.c
index 9cf687e4a29a..8c199f537732 100644
--- a/trunk/mm/filemap_xip.c
+++ b/trunk/mm/filemap_xip.c
@@ -174,8 +174,6 @@ __xip_unmap (struct address_space * mapping,
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
-	spinlock_t *ptl;
-	struct page *page;
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -183,17 +181,19 @@ __xip_unmap (struct address_space * mapping,
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		page = ZERO_PAGE(address);
-		pte = page_check_address(page, mm, address, &ptl);
-		if (pte) {
+		/*
+		 * We need the page_table_lock to protect us from page faults,
+		 * munmap, fork, etc...
+		 */
+		pte = page_check_address(ZERO_PAGE(address), mm,
+					 address);
+		if (!IS_ERR(pte)) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
-			page_remove_rmap(page);
-			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
-			pte_unmap_unlock(pte, ptl);
-			page_cache_release(page);
+			pte_unmap(pte);
+			spin_unlock(&mm->page_table_lock);
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
 
 	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page)) {
-		goto out;
+		return page;
 	}
 	if (PTR_ERR(page) != -ENODATA)
 		return NULL;
@@ -249,8 +249,6 @@ xip_file_nopage(struct vm_area_struct * area,
 		page = ZERO_PAGE(address);
 	}
 
-out:
-	page_cache_get(page);
 	return page;
 }
 
diff --git a/trunk/mm/fremap.c b/trunk/mm/fremap.c
index d862be3bc3e3..ab23a0673c35 100644
--- a/trunk/mm/fremap.c
+++ b/trunk/mm/fremap.c
@@ -20,32 +20,33 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
-	struct page *page = NULL;
 
+	if (pte_none(pte))
+		return;
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
+
 		flush_cache_page(vma, addr, pfn);
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, pte, addr);
-			goto out;
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+			if (!PageReserved(page)) {
+				if (pte_dirty(pte))
+					set_page_dirty(page);
+				page_remove_rmap(page);
+				page_cache_release(page);
+				dec_mm_counter(mm, rss);
+			}
 		}
-		page = pfn_to_page(pfn);
-		if (pte_dirty(pte))
-			set_page_dirty(page);
-		page_remove_rmap(page);
-		page_cache_release(page);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
-out:
-	return !!page;
 }
 
 /*
@@ -63,20 +64,21 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
-	spinlock_t *ptl;
-
-	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
+	spin_lock(&mm->page_table_lock);
+	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto out;
+		goto err_unlock;
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto out;
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		goto err_unlock;
+
+	pte = pte_alloc_map(mm, pmd, addr);
 	if (!pte)
-		goto out;
+		goto err_unlock;
 
 	/*
 	 * This page may have been truncated. Tell the
@@ -86,27 +88,29 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inode = vma->vm_file->f_mapping->host;
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (!page->mapping || page->index >= size)
-		goto unlock;
+		goto err_unlock;
 	err = -ENOMEM;
 	if (page_mapcount(page) > INT_MAX/2)
-		goto unlock;
+		goto err_unlock;
 
-	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
-		inc_mm_counter(mm, file_rss);
+	zap_pte(mm, vma, addr, pte);
 
+	inc_mm_counter(mm,rss);
 	flush_icache_page(vma, page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
 	pte_val = *pte;
+	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
+
 	err = 0;
-unlock:
-	pte_unmap_unlock(pte, ptl);
-out:
+err_unlock:
+	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 EXPORT_SYMBOL(install_page);
 
+
 /*
  * Install a file pte to a given virtual memory address, release any
  * previously existing mapping.
@@ -120,35 +124,37 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
-	spinlock_t *ptl;
-
-	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
+	spin_lock(&mm->page_table_lock);
+	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto out;
+		goto err_unlock;
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto out;
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		goto err_unlock;
+
+	pte = pte_alloc_map(mm, pmd, addr);
 	if (!pte)
-		goto out;
+		goto err_unlock;
 
-	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
-		update_hiwater_rss(mm);
-		dec_mm_counter(mm, file_rss);
-	}
+	zap_pte(mm, vma, addr, pte);
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
+	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-	pte_unmap_unlock(pte, ptl);
-	err = 0;
-out:
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+
+err_unlock:
+	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 
+
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c
index c9b43360fd33..61d380678030 100644
--- a/trunk/mm/hugetlb.c
+++ b/trunk/mm/hugetlb.c
@@ -277,23 +277,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long addr;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		src_pte = huge_pte_offset(src, addr);
-		if (!src_pte)
-			continue;
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
-		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
-		if (!pte_none(*src_pte)) {
+		src_pte = huge_pte_offset(src, addr);
+		if (src_pte && !pte_none(*src_pte)) {
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
+			add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
 	}
 	return 0;
 
@@ -314,14 +310,12 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	spin_lock(&mm->page_table_lock);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
-		if (!ptep)
+		if (! ptep)
+			/* This can happen on truncate, or if an
+			 * mmap() is aborted due to an error before
+			 * the prefault */
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -330,99 +324,96 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		put_page(page);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
+		add_mm_counter(mm, rss,  - (HPAGE_SIZE / PAGE_SIZE));
 	}
+	flush_tlb_range(vma, start, end);
+}
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
 
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
 	spin_unlock(&mm->page_table_lock);
-	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_lock_huge_page(struct address_space *mapping,
-			unsigned long idx)
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
-	struct page *page;
-	int err;
-	struct inode *inode = mapping->host;
-	unsigned long size;
-
-retry:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		goto out;
-
-	/* Check to make sure the mapping hasn't been truncated */
-	size = i_size_read(inode) >> HPAGE_SHIFT;
-	if (idx >= size)
-		goto out;
-
-	if (hugetlb_get_quota(mapping))
-		goto out;
-	page = alloc_huge_page();
-	if (!page) {
-		hugetlb_put_quota(mapping);
-		goto out;
-	}
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	WARN_ON(!is_vm_hugetlb_page(vma));
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	hugetlb_prefault_arch_hook(mm);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		unsigned long idx;
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		struct page *page;
+
+		if (!pte) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			/* charge the fs quota first */
+			if (hugetlb_get_quota(mapping)) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			page = alloc_huge_page();
+			if (!page) {
+				hugetlb_put_quota(mapping);
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+			if (! ret) {
+				unlock_page(page);
+			} else {
+				hugetlb_put_quota(mapping);
+				free_huge_page(page);
+				goto out;
+			}
+		}
+		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
 	}
 out:
-	return page;
+	spin_unlock(&mm->page_table_lock);
+	return ret;
 }
 
+/*
+ * On ia64 at least, it is possible to receive a hugetlb fault from a
+ * stale zero entry left in the TLB from earlier hardware prefetching.
+ * Low-level arch code should already have flushed the stale entry as
+ * part of its fault handling, but we do need to accept this minor fault
+ * and return successfully.  Whereas the "normal" case is that this is
+ * an access to a hugetlb page which has been truncated off since mmap.
+ */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
-	unsigned long idx;
-	unsigned long size;
 	pte_t *pte;
-	struct page *page;
-	struct address_space *mapping;
-
-	pte = huge_pte_alloc(mm, address);
-	if (!pte)
-		goto out;
-
-	mapping = vma->vm_file->f_mapping;
-	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
-		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-	/*
-	 * Use page lock to guard against racing truncation
-	 * before we get page_table_lock.
-	 */
-	page = find_lock_huge_page(mapping, idx);
-	if (!page)
-		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
-	if (idx >= size)
-		goto backout;
-
-	ret = VM_FAULT_MINOR;
-	if (!pte_none(*pte))
-		goto backout;
-
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+	pte = huge_pte_offset(mm, address);
+	if (pte && !pte_none(*pte))
+		ret = VM_FAULT_MINOR;
 	spin_unlock(&mm->page_table_lock);
-	unlock_page(page);
-out:
 	return ret;
-
-backout:
-	spin_unlock(&mm->page_table_lock);
-	hugetlb_put_quota(mapping);
-	unlock_page(page);
-	put_page(page);
-	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -432,36 +423,34 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
+	BUG_ON(!is_vm_hugetlb_page(vma));
+
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
-		pte_t *pte;
-		struct page *page;
 
-		/*
-		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage.  We have to make * sure we get the
-		 * first, for the page indexing below to work.
-		 */
-		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-		if (!pte || pte_none(*pte)) {
-			int ret;
+		if (pages) {
+			pte_t *pte;
+			struct page *page;
+
+			/* Some archs (sparc64, sh*) have multiple
+			 * pte_ts to each hugepage.  We have to make
+			 * sure we get the first, for the page
+			 * indexing below to work. */
+			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+			/* the hugetlb file might have been truncated */
+			if (!pte || pte_none(*pte)) {
+				remainder = 0;
+				if (!i)
+					i = -EFAULT;
+				break;
+			}
 
-			spin_unlock(&mm->page_table_lock);
-			ret = hugetlb_fault(mm, vma, vaddr, 0);
-			spin_lock(&mm->page_table_lock);
-			if (ret == VM_FAULT_MINOR)
-				continue;
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 
-			remainder = 0;
-			if (!i)
-				i = -EFAULT;
-			break;
-		}
+			WARN_ON(!PageCompound(page));
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 			get_page(page);
 			pages[i] = page;
 		}
diff --git a/trunk/mm/madvise.c b/trunk/mm/madvise.c
index 17aaf3e16449..20e075d1c64c 100644
--- a/trunk/mm/madvise.c
+++ b/trunk/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
+	if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c
index 0f60baf6f69b..1db40e935e55 100644
--- a/trunk/mm/memory.c
+++ b/trunk/mm/memory.c
@@ -114,7 +114,6 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
 	pmd_clear(pmd);
-	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_page_state(nr_page_table_pages);
 	tlb->mm->nr_ptes--;
@@ -250,7 +249,7 @@ void free_pgd_range(struct mmu_gather **tlb,
 		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 
-	if (!(*tlb)->fullmm)
+	if (!tlb_is_full_mm(*tlb))
 		flush_tlb_pgtables((*tlb)->mm, start, end);
 }
 
@@ -261,12 +260,6 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
-		/*
-		 * Hide vma from rmap and vmtruncate before freeing pgtables
-		 */
-		anon_vma_unlink(vma);
-		unlink_file_vma(vma);
-
 		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
@@ -279,8 +272,6 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 							HPAGE_SIZE)) {
 				vma = next;
 				next = vma->vm_next;
-				anon_vma_unlink(vma);
-				unlink_file_vma(vma);
 			}
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
@@ -289,78 +280,72 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 	}
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
 {
-	struct page *new = pte_alloc_one(mm, address);
-	if (!new)
-		return -ENOMEM;
-
-	pte_lock_init(new);
-	spin_lock(&mm->page_table_lock);
-	if (pmd_present(*pmd)) {	/* Another has populated it */
-		pte_lock_deinit(new);
-		pte_free(new);
-	} else {
+	if (!pmd_present(*pmd)) {
+		struct page *new;
+
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free(new);
+			goto out;
+		}
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
 	}
-	spin_unlock(&mm->page_table_lock);
-	return 0;
+out:
+	return pte_offset_map(pmd, address);
 }
 
-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
-	if (!new)
-		return -ENOMEM;
+	if (!pmd_present(*pmd)) {
+		pte_t *new;
 
-	spin_lock(&init_mm.page_table_lock);
-	if (pmd_present(*pmd))		/* Another has populated it */
-		pte_free_kernel(new);
-	else
-		pmd_populate_kernel(&init_mm, pmd, new);
-	spin_unlock(&init_mm.page_table_lock);
-	return 0;
-}
-
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
-{
-	if (file_rss)
-		add_mm_counter(mm, file_rss, file_rss);
-	if (anon_rss)
-		add_mm_counter(mm, anon_rss, anon_rss);
-}
+		spin_unlock(&mm->page_table_lock);
+		new = pte_alloc_one_kernel(mm, address);
+		spin_lock(&mm->page_table_lock);
+		if (!new)
+			return NULL;
 
-/*
- * This function is called to print an error when a pte in a
- * !VM_RESERVED region is found pointing to an invalid pfn (which
- * is an error.
- *
- * The calling function must still handle the error.
- */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
-{
-	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-			"vm_flags = %lx, vaddr = %lx\n",
-		(long long)pte_val(pte),
-		(vma->vm_mm == current->mm ? current->comm : "???"),
-		vma->vm_flags, vaddr);
-	dump_stack();
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pmd_present(*pmd)) {
+			pte_free_kernel(new);
+			goto out;
+		}
+		pmd_populate_kernel(mm, pmd, new);
+	}
+out:
+	return pte_offset_kernel(pmd, address);
 }
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
+ *
+ * dst->page_table_lock is held on entry and exit,
+ * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
 static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+		unsigned long addr)
 {
-	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
@@ -372,31 +357,28 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
-				if (list_empty(&dst_mm->mmlist))
-					list_add(&dst_mm->mmlist,
-						 &src_mm->mmlist);
+				list_add(&dst_mm->mmlist, &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
 		}
-		goto out_set_pte;
+		set_pte_at(dst_mm, addr, dst_pte, pte);
+		return;
 	}
 
-	/* If the region is VM_RESERVED, the mapping is not
-	 * mapped via rmap - duplicate the pte as is.
-	 */
-	if (vm_flags & VM_RESERVED)
-		goto out_set_pte;
-
 	pfn = pte_pfn(pte);
-	/* If the pte points outside of valid memory but
-	 * the region is not VM_RESERVED, we have a problem.
+	/* the pte points outside of valid memory, the
+	 * mapping is assumed to be good, meaningful
+	 * and not mapped via rmap - duplicate the
+	 * mapping as is.
 	 */
-	if (unlikely(!pfn_valid(pfn))) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
+	page = NULL;
+	if (pfn_valid(pfn))
+		page = pfn_to_page(pfn);
 
-	page = pfn_to_page(pfn);
+	if (!page || PageReserved(page)) {
+		set_pte_at(dst_mm, addr, dst_pte, pte);
+		return;
+	}
 
 	/*
 	 * If it's a COW mapping, write protect it both
@@ -415,11 +397,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
-
-out_set_pte:
+	inc_mm_counter(dst_mm, rss);
+	if (PageAnon(page))
+		inc_mm_counter(dst_mm, anon_rss);
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	page_dup_rmap(page);
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -427,44 +409,38 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
-	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
-	int rss[2];
+	unsigned long vm_flags = vma->vm_flags;
+	int progress;
 
 again:
-	rss[1] = rss[0] = 0;
-	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
-	src_ptl = pte_lockptr(src_mm, src_pmd);
-	spin_lock(src_ptl);
 
+	progress = 0;
+	spin_lock(&src_mm->page_table_lock);
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
 		 * could generate latencies in another task on another CPU.
 		 */
-		if (progress >= 32) {
-			progress = 0;
-			if (need_resched() ||
-			    need_lockbreak(src_ptl) ||
-			    need_lockbreak(dst_ptl))
-				break;
-		}
+		if (progress >= 32 && (need_resched() ||
+		    need_lockbreak(&src_mm->page_table_lock) ||
+		    need_lockbreak(&dst_mm->page_table_lock)))
+			break;
 		if (pte_none(*src_pte)) {
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+	spin_unlock(&src_mm->page_table_lock);
 
-	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
-	pte_unmap_unlock(dst_pte - 1, dst_ptl);
-	cond_resched();
+	pte_unmap(dst_pte - 1);
+	cond_resched_lock(&dst_mm->page_table_lock);
 	if (addr != end)
 		goto again;
 	return 0;
@@ -549,30 +525,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb,
-				struct vm_area_struct *vma, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
-	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
-	spinlock_t *ptl;
-	int file_rss = 0;
-	int anon_rss = 0;
 
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = pte_offset_map(pmd, addr);
 	do {
 		pte_t ptent = *pte;
 		if (pte_none(ptent))
 			continue;
 		if (pte_present(ptent)) {
 			struct page *page = NULL;
-			if (!(vma->vm_flags & VM_RESERVED)) {
-				unsigned long pfn = pte_pfn(ptent);
-				if (unlikely(!pfn_valid(pfn)))
-					print_bad_pte(vma, ptent, addr);
-				else
-					page = pfn_to_page(pfn);
+			unsigned long pfn = pte_pfn(ptent);
+			if (pfn_valid(pfn)) {
+				page = pfn_to_page(pfn);
+				if (PageReserved(page))
+					page = NULL;
 			}
 			if (unlikely(details) && page) {
 				/*
@@ -592,7 +562,7 @@ static void zap_pte_range(struct mmu_gather *tlb,
 				     page->index > details->last_index))
 					continue;
 			}
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
+			ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
@@ -600,17 +570,15 @@ static void zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(details) && details->nonlinear_vma
 			    && linear_page_index(details->nonlinear_vma,
 						addr) != page->index)
-				set_pte_at(mm, addr, pte,
+				set_pte_at(tlb->mm, addr, pte,
 					   pgoff_to_pte(page->index));
+			if (pte_dirty(ptent))
+				set_page_dirty(page);
 			if (PageAnon(page))
-				anon_rss--;
-			else {
-				if (pte_dirty(ptent))
-					set_page_dirty(page);
-				if (pte_young(ptent))
-					mark_page_accessed(page);
-				file_rss--;
-			}
+				dec_mm_counter(tlb->mm, anon_rss);
+			else if (pte_young(ptent))
+				mark_page_accessed(page);
+			tlb->freed++;
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
 			continue;
@@ -623,15 +591,12 @@ static void zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear_full(mm, addr, pte, tlb->fullmm);
+		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	add_mm_rss(mm, file_rss, anon_rss);
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb,
-				struct vm_area_struct *vma, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -643,12 +608,11 @@ static inline void zap_pmd_range(struct mmu_gather *tlb,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		zap_pte_range(tlb, vma, pmd, addr, next, details);
+		zap_pte_range(tlb, pmd, addr, next, details);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb,
-				struct vm_area_struct *vma, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -660,7 +624,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		zap_pmd_range(tlb, vma, pud, addr, next, details);
+		zap_pmd_range(tlb, pud, addr, next, details);
 	} while (pud++, addr = next, addr != end);
 }
 
@@ -681,7 +645,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		zap_pud_range(tlb, vma, pgd, addr, next, details);
+		zap_pud_range(tlb, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -696,6 +660,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlbp: address of the caller's struct mmu_gather
+ * @mm: the controlling mm_struct
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -704,10 +669,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  *
  * Returns the end address of the unmapping (restart addr if interrupted).
  *
- * Unmap all pages in the vma list.
+ * Unmap all pages in the vma list.  Called under page_table_lock.
  *
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * We aim to not hold page_table_lock for too long (for scheduling latency
+ * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
  * return the ending mmu_gather to the caller.
  *
  * Only addresses between `start' and `end' will be unmapped.
@@ -719,7 +684,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *details)
@@ -729,7 +694,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-	int fullmm = (*tlbp)->fullmm;
+	int fullmm = tlb_is_full_mm(*tlbp);
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
@@ -769,15 +734,19 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 			tlb_finish_mmu(*tlbp, tlb_start, start);
 
 			if (need_resched() ||
+				need_lockbreak(&mm->page_table_lock) ||
 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
 				if (i_mmap_lock) {
-					*tlbp = NULL;
+					/* must reset count of rss freed */
+					*tlbp = tlb_gather_mmu(mm, fullmm);
 					goto out;
 				}
+				spin_unlock(&mm->page_table_lock);
 				cond_resched();
+				spin_lock(&mm->page_table_lock);
 			}
 
-			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+			*tlbp = tlb_gather_mmu(mm, fullmm);
 			tlb_start_valid = 0;
 			zap_bytes = ZAP_BLOCK_SIZE;
 		}
@@ -801,93 +770,123 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
+	if (is_vm_hugetlb_page(vma)) {
+		zap_hugepage_range(vma, address, size);
+		return end;
+	}
+
 	lru_add_drain();
+	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
-	update_hiwater_rss(mm);
-	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-	if (tlb)
-		tlb_finish_mmu(tlb, address, end);
+	end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
+	tlb_finish_mmu(tlb, address, end);
+	spin_unlock(&mm->page_table_lock);
 	return end;
 }
 
 /*
  * Do a quick page-table lookup for a single page.
+ * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
-			unsigned int flags)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+			int read, int write, int accessed)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
-	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
-	if (!IS_ERR(page)) {
-		BUG_ON(flags & FOLL_GET);
-		goto out;
-	}
+	page = follow_huge_addr(mm, address, write);
+	if (! IS_ERR(page))
+		return page;
 
-	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto no_page_table;
+		goto out;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto no_page_table;
+		goto out;
 	
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto no_page_table;
-
-	if (pmd_huge(*pmd)) {
-		BUG_ON(flags & FOLL_GET);
-		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
-	}
+	if (pmd_huge(*pmd))
+		return follow_huge_pmd(mm, address, pmd, write);
 
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	ptep = pte_offset_map(pmd, address);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
-	if (!pte_present(pte))
-		goto unlock;
-	if ((flags & FOLL_WRITE) && !pte_write(pte))
-		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
-		goto unlock;
-
-	page = pfn_to_page(pfn);
-	if (flags & FOLL_GET)
-		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		mark_page_accessed(page);
+	pte_unmap(ptep);
+	if (pte_present(pte)) {
+		if (write && !pte_write(pte))
+			goto out;
+		if (read && !pte_read(pte))
+			goto out;
+		pfn = pte_pfn(pte);
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (accessed) {
+				if (write && !pte_dirty(pte) &&!PageDirty(page))
+					set_page_dirty(page);
+				mark_page_accessed(page);
+			}
+			return page;
+		}
 	}
-unlock:
-	pte_unmap_unlock(ptep, ptl);
+
 out:
-	return page;
+	return NULL;
+}
 
-no_page_table:
-	/*
-	 * When core dumping an enormous anonymous area that nobody
-	 * has touched so far, we don't want to allocate page tables.
-	 */
-	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(address);
-		if (flags & FOLL_GET)
-			get_page(page);
-		BUG_ON(flags & FOLL_WRITE);
-	}
-	return page;
+inline struct page *
+follow_page(struct mm_struct *mm, unsigned long address, int write)
+{
+	return __follow_page(mm, address, 0, write, 1);
+}
+
+/*
+ * check_user_page_readable() can be called frm niterrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
+{
+	return __follow_page(mm, address, 1, 0, 0) != NULL;
+}
+EXPORT_SYMBOL(check_user_page_readable);
+
+static inline int
+untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
+			 unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	/* Check if the vma is for an anonymous mapping. */
+	if (vma->vm_ops && vma->vm_ops->nopage)
+		return 0;
+
+	/* Check if page directory entry exists. */
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		return 1;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		return 1;
+
+	/* Check if page middle directory entry exists. */
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		return 1;
+
+	/* There is a pte slot for 'address' in 'mm'. */
+	return 0;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -895,19 +894,18 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags;
+	unsigned int flags;
 
 	/* 
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
-		struct vm_area_struct *vma;
-		unsigned int foll_flags;
+		struct vm_area_struct *	vma;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -947,8 +945,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(vm_flags & vma->vm_flags))
+		if (!vma || (vma->vm_flags & VM_IO)
+				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -956,25 +954,29 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-
-		foll_flags = FOLL_TOUCH;
-		if (pages)
-			foll_flags |= FOLL_GET;
-		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->nopage))
-			foll_flags |= FOLL_ANON;
-
+		spin_lock(&mm->page_table_lock);
 		do {
+			int write_access = write;
 			struct page *page;
 
-			if (write)
-				foll_flags |= FOLL_WRITE;
-
-			cond_resched();
-			while (!(page = follow_page(mm, start, foll_flags))) {
+			cond_resched_lock(&mm->page_table_lock);
+			while (!(page = follow_page(mm, start, write_access))) {
 				int ret;
-				ret = __handle_mm_fault(mm, vma, start,
-						foll_flags & FOLL_WRITE);
+
+				/*
+				 * Shortcut for anonymous pages. We don't want
+				 * to force the creation of pages tables for
+				 * insanely big anonymously mapped areas that
+				 * nobody touched so far. This is important
+				 * for doing a core dump for these mappings.
+				 */
+				if (!write && untouched_anonymous_page(mm,vma,start)) {
+					page = ZERO_PAGE(start);
+					break;
+				}
+				spin_unlock(&mm->page_table_lock);
+				ret = __handle_mm_fault(mm, vma, start, write_access);
+
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -982,7 +984,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					foll_flags &= ~FOLL_WRITE;
+					write_access = 0;
 				
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -998,10 +1000,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
+				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
+				if (!PageReserved(page))
+					page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1009,6 +1014,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
+		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
@@ -1018,21 +1024,16 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end, pgprot_t prot)
 {
 	pte_t *pte;
-	spinlock_t *ptl;
 
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	pte = pte_alloc_map(mm, pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
-		struct page *page = ZERO_PAGE(addr);
-		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
-		page_cache_get(page);
-		page_add_file_rmap(page);
-		inc_mm_counter(mm, file_rss);
+		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap(pte - 1);
 	return 0;
 }
 
@@ -1082,12 +1083,14 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = zeromap_pud_range(mm, pgd, addr, next, prot);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 
@@ -1101,17 +1104,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long pfn, pgprot_t prot)
 {
 	pte_t *pte;
-	spinlock_t *ptl;
 
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	pte = pte_alloc_map(mm, pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
+			set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap(pte - 1);
 	return 0;
 }
 
@@ -1170,8 +1173,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells the core MM not to "manage" these pages
-         *	(e.g. refcount, mapcount, try to swap them out).
+	 *   VM_RESERVED tells swapout not to try to touch
+	 *	this region.
 	 */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1179,6 +1182,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = remap_pud_range(mm, pgd, addr, next,
@@ -1186,35 +1190,11 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
@@ -1228,11 +1208,29 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
+/*
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ */
+static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
+		pte_t *page_table)
+{
+	pte_t entry;
+
+	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
+			      vma);
+	ptep_establish(vma, address, page_table, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
  *
+ * Goto-purists beware: the only reason for goto's here is that it results
+ * in better assembly code.. The "default" path will see no jumps at all.
+ *
  * Note that this routine assumes that the protection checks have been
  * done by the caller (the low-level page fault routine in most cases).
  * Thus we can safely just mark it writable once we've done any necessary
@@ -1242,28 +1240,28 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We hold the mm semaphore and the page_table_lock on entry and exit
+ * with the page_table_lock released.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
+	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
+	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
-
-	BUG_ON(vma->vm_flags & VM_RESERVED);
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
-		 * Page table corrupted: show pte and kill process.
+		 * This should really halt the system so it can be debugged or
+		 * at least the kernel stops what it's doing before it corrupts
+		 * data, but for the moment just pretend this is OOM.
 		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
+		pte_unmap(page_table);
+		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
+				address);
+		spin_unlock(&mm->page_table_lock);
+		return VM_FAULT_OOM;
 	}
 	old_page = pfn_to_page(pfn);
 
@@ -1272,51 +1270,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unlock_page(old_page);
 		if (reuse) {
 			flush_cache_page(vma, address, pfn);
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
+					      vma);
 			ptep_set_access_flags(vma, address, page_table, entry, 1);
 			update_mmu_cache(vma, address, entry);
 			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			return VM_FAULT_MINOR|VM_FAULT_WRITE;
 		}
 	}
+	pte_unmap(page_table);
 
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	page_cache_get(old_page);
-	pte_unmap_unlock(page_table, ptl);
+	if (!PageReserved(old_page))
+		page_cache_get(old_page);
+	spin_unlock(&mm->page_table_lock);
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+		goto no_new_page;
 	if (old_page == ZERO_PAGE(address)) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
-			goto oom;
+			goto no_new_page;
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
-			goto oom;
+			goto no_new_page;
 		copy_user_highpage(new_page, old_page, address);
 	}
-
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (likely(pte_same(*page_table, orig_pte))) {
-		page_remove_rmap(old_page);
-		if (!PageAnon(old_page)) {
-			inc_mm_counter(mm, anon_rss);
-			dec_mm_counter(mm, file_rss);
-		}
+	ret = VM_FAULT_MINOR;
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+	if (likely(pte_same(*page_table, pte))) {
+		if (PageAnon(old_page))
+			dec_mm_counter(mm, anon_rss);
+		if (PageReserved(old_page))
+			inc_mm_counter(mm, rss);
+		else
+			page_remove_rmap(old_page);
 		flush_cache_page(vma, address, pfn);
-		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		ptep_establish(vma, address, page_table, entry);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
+		break_cow(vma, new_page, address, page_table);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
@@ -1324,12 +1323,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
 	}
+	pte_unmap(page_table);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+	spin_unlock(&mm->page_table_lock);
 	return ret;
-oom:
+
+no_new_page:
 	page_cache_release(old_page);
 	return VM_FAULT_OOM;
 }
@@ -1399,6 +1399,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
+
+	/*
+	 * We cannot rely on the break test in unmap_vmas:
+	 * on the one hand, we don't want to restart our loop
+	 * just because that broke out for the page_table_lock;
+	 * on the other hand, it does no test when vma is small.
+	 */
 	need_break = need_resched() ||
 			need_lockbreak(details->i_mmap_lock);
 
@@ -1647,37 +1654,38 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
 }
 
 /*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We hold the mm semaphore and the page_table_lock on entry and
+ * should release the pagetable lock on exit..
  */
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pte_t orig_pte)
+static int do_swap_page(struct mm_struct * mm,
+	struct vm_area_struct * vma, unsigned long address,
+	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
 {
-	spinlock_t *ptl;
 	struct page *page;
-	swp_entry_t entry;
+	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
-	entry = pte_to_swp_entry(orig_pte);
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
  		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
 			/*
-			 * Back out if somebody else faulted in this pte
-			 * while we released the pte lock.
+			 * Back out if somebody else faulted in this pte while
+			 * we released the page table lock.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
-			goto unlock;
+			else
+				ret = VM_FAULT_MINOR;
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
 		}
 
 		/* Had to read the page from swap area: Major fault */
@@ -1690,11 +1698,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 
 	/*
-	 * Back out if somebody else already faulted in this pte.
+	 * Back out if somebody else faulted in this pte while we
+	 * released the page table lock.
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (unlikely(!pte_same(*page_table, orig_pte)))
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+	if (unlikely(!pte_same(*page_table, orig_pte))) {
+		ret = VM_FAULT_MINOR;
 		goto out_nomap;
+	}
 
 	if (unlikely(!PageUptodate(page))) {
 		ret = VM_FAULT_SIGBUS;
@@ -1703,7 +1715,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/* The page isn't present yet, go ahead with the fault. */
 
-	inc_mm_counter(mm, anon_rss);
+	inc_mm_counter(mm, rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1721,7 +1733,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+				page_table, pmd, pte) == VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -1729,76 +1741,74 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
 	lazy_mmu_prot_update(pte);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
 out:
 	return ret;
 out_nomap:
-	pte_unmap_unlock(page_table, ptl);
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 	page_cache_release(page);
-	return ret;
+	goto out;
 }
 
 /*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We are called with the MM semaphore and page_table_lock
+ * spinlock held to protect against concurrent faults in
+ * multithreaded programs. 
  */
-static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+static int
+do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		pte_t *page_table, pmd_t *pmd, int write_access,
+		unsigned long addr)
 {
-	struct page *page;
-	spinlock_t *ptl;
 	pte_t entry;
+	struct page * page = ZERO_PAGE(addr);
+
+	/* Read-only mapping of ZERO_PAGE. */
+	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 
+	/* ..except if it's a write access */
 	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
+		spin_unlock(&mm->page_table_lock);
 
 		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_zeroed_user_highpage(vma, address);
+			goto no_mem;
+		page = alloc_zeroed_user_highpage(vma, addr);
 		if (!page)
-			goto oom;
+			goto no_mem;
 
-		entry = mk_pte(page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		spin_lock(&mm->page_table_lock);
+		page_table = pte_offset_map(pmd, addr);
 
-		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, anon_rss);
+		if (!pte_none(*page_table)) {
+			pte_unmap(page_table);
+			page_cache_release(page);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+		inc_mm_counter(mm, rss);
+		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
+							 vma->vm_page_prot)),
+				      vma);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
-		page_add_anon_rmap(page, vma, address);
-	} else {
-		/* Map the ZERO_PAGE - vm_page_prot is readonly */
-		page = ZERO_PAGE(address);
-		page_cache_get(page);
-		entry = mk_pte(page, vma->vm_page_prot);
-
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, file_rss);
-		page_add_file_rmap(page);
+		page_add_anon_rmap(page, vma, addr);
 	}
 
-	set_pte_at(mm, address, page_table, entry);
+	set_pte_at(mm, addr, page_table, entry);
+	pte_unmap(page_table);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, address, entry);
+	update_mmu_cache(vma, addr, entry);
 	lazy_mmu_prot_update(entry);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+	spin_unlock(&mm->page_table_lock);
+out:
 	return VM_FAULT_MINOR;
-release:
-	page_cache_release(page);
-	goto unlock;
-oom:
+no_mem:
 	return VM_FAULT_OOM;
 }
 
@@ -1811,23 +1821,25 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This is called with the MM semaphore held and the page table
+ * spinlock held. Exit with the spinlock released.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+static int
+do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
-	spinlock_t *ptl;
-	struct page *new_page;
+	struct page * new_page;
 	struct address_space *mapping = NULL;
 	pte_t entry;
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 
+	if (!vma->vm_ops || !vma->vm_ops->nopage)
+		return do_anonymous_page(mm, vma, page_table,
+					pmd, write_access, address);
 	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
@@ -1835,6 +1847,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		smp_rmb(); /* serializes i_size against truncate_count */
 	}
 retry:
+	cond_resched();
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 	/*
 	 * No smp_rmb is needed here as long as there's a full
@@ -1867,20 +1880,19 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		anon = 1;
 	}
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	spin_lock(&mm->page_table_lock);
 	/*
 	 * For a file-backed vma, someone could have truncated or otherwise
 	 * invalidated this page.  If unmap_mapping_range got called,
 	 * retry getting the page.
 	 */
 	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		pte_unmap_unlock(page_table, ptl);
-		page_cache_release(new_page);
-		cond_resched();
 		sequence = mapping->truncate_count;
-		smp_rmb();
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
 		goto retry;
 	}
+	page_table = pte_offset_map(pmd, address);
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -1894,67 +1906,68 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
+		if (!PageReserved(new_page))
+			inc_mm_counter(mm, rss);
+
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else if (!(vma->vm_flags & VM_RESERVED)) {
-			inc_mm_counter(mm, file_rss);
+		} else
 			page_add_file_rmap(new_page);
-		}
+		pte_unmap(page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
+		pte_unmap(page_table);
 		page_cache_release(new_page);
-		goto unlock;
+		spin_unlock(&mm->page_table_lock);
+		goto out;
 	}
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+	spin_unlock(&mm->page_table_lock);
+out:
 	return ret;
 oom:
 	page_cache_release(new_page);
-	return VM_FAULT_OOM;
+	ret = VM_FAULT_OOM;
+	goto out;
 }
 
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pte_t orig_pte)
+static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
+	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
 {
-	pgoff_t pgoff;
+	unsigned long pgoff;
 	int err;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return VM_FAULT_MINOR;
-
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		return VM_FAULT_OOM;
+	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
+	/*
+	 * Fall back to the linear mapping if the fs does not support
+	 * ->populate:
+	 */
+	if (!vma->vm_ops->populate ||
+			(write_access && !(vma->vm_flags & VM_SHARED))) {
+		pte_clear(mm, address, pte);
+		return do_no_page(mm, vma, address, write_access, pte, pmd);
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
-	pgoff = pte_to_pgoff(orig_pte);
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-					vma->vm_page_prot, pgoff, 0);
+	pgoff = pte_to_pgoff(*pte);
+
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
+
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
 	if (err)
@@ -1971,68 +1984,56 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * Note the "page_table_lock". It is to protect against kswapd removing
+ * pages from under us. Note that kswapd only ever _removes_ pages, never
+ * adds them. As such, once we have noticed that the page is not present,
+ * we can drop the lock early.
+ *
+ * The adding of pages is protected by the MM semaphore (which we hold),
+ * so we don't need to worry about a page being suddenly been added into
+ * our VM.
+ *
+ * We enter with the pagetable spinlock held, we are supposed to
+ * release it when done.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long address,
-		pte_t *pte, pmd_t *pmd, int write_access)
+	struct vm_area_struct * vma, unsigned long address,
+	int write_access, pte_t *pte, pmd_t *pmd)
 {
 	pte_t entry;
-	pte_t old_entry;
-	spinlock_t *ptl;
 
-	old_entry = entry = *pte;
+	entry = *pte;
 	if (!pte_present(entry)) {
-		if (pte_none(entry)) {
-			if (!vma->vm_ops || !vma->vm_ops->nopage)
-				return do_anonymous_page(mm, vma, address,
-					pte, pmd, write_access);
-			return do_no_page(mm, vma, address,
-					pte, pmd, write_access);
-		}
+		/*
+		 * If it truly wasn't present, we know that kswapd
+		 * and the PTE updates will not touch it later. So
+		 * drop the lock.
+		 */
+		if (pte_none(entry))
+			return do_no_page(mm, vma, address, write_access, pte, pmd);
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address,
-					pte, pmd, write_access, entry);
-		return do_swap_page(mm, vma, address,
-					pte, pmd, write_access, entry);
+			return do_file_page(mm, vma, address, write_access, pte, pmd);
+		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
 	}
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
-	if (unlikely(!pte_same(*pte, entry)))
-		goto unlock;
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+			return do_wp_page(mm, vma, address, pte, pmd, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (!pte_same(old_entry, entry)) {
-		ptep_set_access_flags(vma, address, pte, entry, write_access);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (write_access)
-			flush_tlb_page(vma, address);
-	}
-unlock:
-	pte_unmap_unlock(pte, ptl);
+	ptep_set_access_flags(vma, address, pte, entry, write_access);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
 	return VM_FAULT_MINOR;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
@@ -2047,66 +2048,100 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
 
+	/*
+	 * We need the page table lock to synchronize with kswapd
+	 * and the SMP-safe atomic PTE updates.
+	 */
 	pgd = pgd_offset(mm, address);
+	spin_lock(&mm->page_table_lock);
+
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
-		return VM_FAULT_OOM;
+		goto oom;
+
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
-		return VM_FAULT_OOM;
+		goto oom;
+
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
-		return VM_FAULT_OOM;
+		goto oom;
+	
+	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ oom:
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_OOM;
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
- * We've already handled the fast-path in-line.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
  */
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	pud_t *new = pud_alloc_one(mm, address);
-	if (!new)
-		return -ENOMEM;
+	pud_t *new;
 
+	spin_unlock(&mm->page_table_lock);
+	new = pud_alloc_one(mm, address);
 	spin_lock(&mm->page_table_lock);
-	if (pgd_present(*pgd))		/* Another has populated it */
+	if (!new)
+		return NULL;
+
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
+	if (pgd_present(*pgd)) {
 		pud_free(new);
-	else
-		pgd_populate(mm, pgd, new);
-	spin_unlock(&mm->page_table_lock);
-	return 0;
+		goto out;
+	}
+	pgd_populate(mm, pgd, new);
+ out:
+	return pud_offset(pgd, address);
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
 /*
  * Allocate page middle directory.
- * We've already handled the fast-path in-line.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
  */
-int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	pmd_t *new = pmd_alloc_one(mm, address);
-	if (!new)
-		return -ENOMEM;
+	pmd_t *new;
 
+	spin_unlock(&mm->page_table_lock);
+	new = pmd_alloc_one(mm, address);
 	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
+
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud))		/* Another has populated it */
+	if (pud_present(*pud)) {
 		pmd_free(new);
-	else
-		pud_populate(mm, pud, new);
+		goto out;
+	}
+	pud_populate(mm, pud, new);
 #else
-	if (pgd_present(*pud))		/* Another has populated it */
+	if (pgd_present(*pud)) {
 		pmd_free(new);
-	else
-		pgd_populate(mm, pud, new);
+		goto out;
+	}
+	pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	spin_unlock(&mm->page_table_lock);
-	return 0;
+
+ out:
+	return pmd_offset(pud, address);
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
@@ -2171,6 +2206,22 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
 
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
+/*
+ * update_mem_hiwater
+ *	- update per process rss and vm high water data
+ */
+void update_mem_hiwater(struct task_struct *tsk)
+{
+	if (tsk->mm) {
+		unsigned long rss = get_mm_counter(tsk->mm, rss);
+
+		if (tsk->mm->hiwater_rss < rss)
+			tsk->mm->hiwater_rss = rss;
+		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
+			tsk->mm->hiwater_vm = tsk->mm->total_vm;
+	}
+}
+
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
@@ -2182,7 +2233,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = VM_RESERVED;
+	gate_vma.vm_flags = 0;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c
deleted file mode 100644
index 431a64f021c0..000000000000
--- a/trunk/mm/memory_hotplug.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- *  linux/mm/memory_hotplug.c
- *
- *  Copyright (C)
- */
-
-#include <linux/config.h>
-#include <linux/stddef.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/interrupt.h>
-#include <linux/pagemap.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/pagevec.h>
-#include <linux/slab.h>
-#include <linux/sysctl.h>
-#include <linux/cpu.h>
-#include <linux/memory.h>
-#include <linux/memory_hotplug.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-
-#include <asm/tlbflush.h>
-
-extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-			  unsigned long size);
-static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	int nr_pages = PAGES_PER_SECTION;
-	int nid = pgdat->node_id;
-	int zone_type;
-
-	zone_type = zone - pgdat->node_zones;
-	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
-}
-
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
-				  int nr_pages);
-static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	int nr_pages = PAGES_PER_SECTION;
-	int ret;
-
-	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
-
-	if (ret < 0)
-		return ret;
-
-	__add_zone(zone, phys_start_pfn);
-	return register_new_memory(__pfn_to_section(phys_start_pfn));
-}
-
-/*
- * Reasonably generic function for adding memory.  It is
- * expected that archs that support memory hotplug will
- * call this function after deciding the zone to which to
- * add the new pages.
- */
-int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
-		 unsigned long nr_pages)
-{
-	unsigned long i;
-	int err = 0;
-
-	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
-		err = __add_section(zone, phys_start_pfn + i);
-
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
-static void grow_zone_span(struct zone *zone,
-		unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long old_zone_end_pfn;
-
-	zone_span_writelock(zone);
-
-	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
-	if (start_pfn < zone->zone_start_pfn)
-		zone->zone_start_pfn = start_pfn;
-
-	if (end_pfn > old_zone_end_pfn)
-		zone->spanned_pages = end_pfn - zone->zone_start_pfn;
-
-	zone_span_writeunlock(zone);
-}
-
-static void grow_pgdat_span(struct pglist_data *pgdat,
-		unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
-
-	if (start_pfn < pgdat->node_start_pfn)
-		pgdat->node_start_pfn = start_pfn;
-
-	if (end_pfn > old_pgdat_end_pfn)
-		pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
-}
-
-int online_pages(unsigned long pfn, unsigned long nr_pages)
-{
-	unsigned long i;
-	unsigned long flags;
-	unsigned long onlined_pages = 0;
-	struct zone *zone;
-
-	/*
-	 * This doesn't need a lock to do pfn_to_page().
-	 * The section can't be removed here because of the
-	 * memory_block->state_sem.
-	 */
-	zone = page_zone(pfn_to_page(pfn));
-	pgdat_resize_lock(zone->zone_pgdat, &flags);
-	grow_zone_span(zone, pfn, pfn + nr_pages);
-	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
-	pgdat_resize_unlock(zone->zone_pgdat, &flags);
-
-	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn + i);
-		online_page(page);
-		onlined_pages++;
-	}
-	zone->present_pages += onlined_pages;
-
-	setup_per_zone_pages_min();
-
-	return 0;
-}
diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c
index 2076b1542b8a..1d5c64df1653 100644
--- a/trunk/mm/mempolicy.c
+++ b/trunk/mm/mempolicy.c
@@ -2,7 +2,6 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
- * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -18,19 +17,13 @@
  *                offset into the backing object or offset into the mapping
  *                for anonymous memory. For process policy an process counter
  *                is used.
- *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
- *                FIXME: memory is allocated starting with the first node
- *                to the last. It would be better if bind would truly restrict
- *                the allocation to memory nodes instead
- *
  * preferred       Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
- *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *		  in a NUMA aware kernel and still does by, ahem, default.
@@ -100,10 +93,23 @@ struct mempolicy default_policy = {
 	.policy = MPOL_DEFAULT,
 };
 
+/* Check if all specified nodes are online */
+static int nodes_online(unsigned long *nodes)
+{
+	DECLARE_BITMAP(online2, MAX_NUMNODES);
+
+	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
+	if (bitmap_empty(online2, MAX_NUMNODES))
+		set_bit(0, online2);
+	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
+		return -EINVAL;
+	return 0;
+}
+
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static int mpol_check_policy(int mode, unsigned long *nodes)
 {
-	int empty = nodes_empty(*nodes);
+	int empty = bitmap_empty(nodes, MAX_NUMNODES);
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -118,20 +124,71 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 			return -EINVAL;
 		break;
 	}
-	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
+	return nodes_online(nodes);
+}
+
+/* Copy a node mask from user space. */
+static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode, int mode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	bitmap_zero(nodes, MAX_NUMNODES);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t,  nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	nodes[nlongs-1] &= endmask;
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes);
+	return mpol_check_policy(mode, nodes);
 }
+
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+static struct zonelist *bind_zonelist(unsigned long *nodes)
 {
 	struct zonelist *zl;
 	int num, max, nd;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes) {
+	for (nd = find_first_bit(nodes, MAX_NUMNODES);
+	     nd < MAX_NUMNODES;
+	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
 		int k;
 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -142,16 +199,17 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 				policy_zone = k;
 		}
 	}
+	BUG_ON(num >= max);
 	zl->zones[num] = NULL;
 	return zl;
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -160,10 +218,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
+		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = first_node(*nodes);
+		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
 			policy->v.preferred_node = -1;
 		break;
@@ -180,14 +238,14 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, unsigned long end, unsigned long *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
-	spinlock_t *ptl;
 
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	spin_lock(&mm->page_table_lock);
+	orig_pte = pte = pte_offset_map(pmd, addr);
 	do {
 		unsigned long pfn;
 		unsigned int nid;
@@ -195,20 +253,19 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		if (!pfn_valid(pfn))
 			continue;
-		}
 		nid = pfn_to_nid(pfn);
-		if (!node_isset(nid, *nodes))
+		if (!test_bit(nid, nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(orig_pte, ptl);
+	pte_unmap(orig_pte);
+	spin_unlock(&mm->page_table_lock);
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+		unsigned long addr, unsigned long end, unsigned long *nodes)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -218,14 +275,14 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(mm, pmd, addr, next, nodes))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+		unsigned long addr, unsigned long end, unsigned long *nodes)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -235,24 +292,24 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(mm, pud, addr, next, nodes))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+static inline int check_pgd_range(struct mm_struct *mm,
+		unsigned long addr, unsigned long end, unsigned long *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
+	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(mm, pgd, addr, next, nodes))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
@@ -261,7 +318,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    nodemask_t *nodes, unsigned long flags)
+	    unsigned long *nodes, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -269,8 +326,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_RESERVED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
@@ -283,7 +338,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma->vm_mm,
+					   start, endvma, nodes);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -337,25 +393,17 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 	return err;
 }
 
-static int contextualize_policy(int mode, nodemask_t *nodes)
-{
-	if (!nodes)
-		return 0;
-
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
-	return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
-		unsigned long mode, nodemask_t *nmask, unsigned long flags)
+/* Change policy for a memory range */
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			  unsigned long mode,
+			  unsigned long __user *nmask, unsigned long maxnode,
+			  unsigned flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
+	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -370,17 +418,20 @@ long do_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	if (mpol_check_policy(mode, nmask))
-		return -EINVAL;
-	new = mpol_new(mode, nmask);
+
+	err = get_nodes(nodes, nmask, maxnode, mode);
+	if (err)
+		return err;
+
+	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-			mode,nodes_addr(nodes)[0]);
+			mode,nodes[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nodes, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -390,45 +441,50 @@ long do_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-long do_set_mempolicy(int mode, nodemask_t *nodes)
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+				   unsigned long maxnode)
 {
+	int err;
 	struct mempolicy *new;
+	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (contextualize_policy(mode, nodes))
+	if (mode < 0 || mode > MPOL_MAX)
 		return -EINVAL;
+	err = get_nodes(nodes, nmask, maxnode, mode);
+	if (err)
+		return err;
 	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = first_node(new->v.nodes);
+		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
 	return 0;
 }
 
 /* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
 {
 	int i;
 
-	nodes_clear(*nodes);
+	bitmap_zero(nodes, MAX_NUMNODES);
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
-				*nodes);
+			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		*nodes = p->v.nodes;
+		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
 		break;
 	case MPOL_PREFERRED:
 		/* or use current node instead of online map? */
 		if (p->v.preferred_node < 0)
-			*nodes = node_online_map;
+			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
 		else
-			node_set(p->v.preferred_node, *nodes);
+			__set_bit(p->v.preferred_node, nodes);
 		break;
 	default:
 		BUG();
@@ -448,17 +504,37 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 	return err;
 }
 
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      void *nodes, unsigned nbytes)
+{
+	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+
+	if (copy > nbytes) {
+		if (copy > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+			return -EFAULT;
+		copy = nbytes;
+	}
+	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
+}
+
 /* Retrieve NUMA policy */
-long do_get_mempolicy(int *policy, nodemask_t *nmask,
-			unsigned long addr, unsigned long flags)
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				  unsigned long __user *nmask,
+				  unsigned long maxnode,
+				  unsigned long addr, unsigned long flags)
 {
-	int err;
+	int err, pval;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -481,25 +557,31 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
-			*policy = err;
+			pval = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			pval = current->il_next;
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
 	} else
-		*policy = pol->policy;
+		pval = pol->policy;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
 		vma = NULL;
 	}
 
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
 	err = 0;
-	if (nmask)
-		get_zonemask(pol, nmask);
+	if (nmask) {
+		DECLARE_BITMAP(nodes, MAX_NUMNODES);
+		get_zonemask(pol, nodes);
+		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
+	}
 
  out:
 	if (vma)
@@ -507,126 +589,6 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
-/*
- * User space interface with variable sized bitmaps for nodelists.
- */
-
-/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
-		     unsigned long maxnode)
-{
-	unsigned long k;
-	unsigned long nlongs;
-	unsigned long endmask;
-
-	--maxnode;
-	nodes_clear(*nodes);
-	if (maxnode == 0 || !nmask)
-		return 0;
-
-	nlongs = BITS_TO_LONGS(maxnode);
-	if ((maxnode % BITS_PER_LONG) == 0)
-		endmask = ~0UL;
-	else
-		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
-	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
-		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
-			if (get_user(t, nmask + k))
-				return -EFAULT;
-			if (k == nlongs - 1) {
-				if (t & endmask)
-					return -EINVAL;
-			} else if (t)
-				return -EINVAL;
-		}
-		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-		endmask = ~0UL;
-	}
-
-	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
-		return -EFAULT;
-	nodes_addr(*nodes)[nlongs-1] &= endmask;
-	return 0;
-}
-
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-			      nodemask_t *nodes)
-{
-	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
-
-	if (copy > nbytes) {
-		if (copy > PAGE_SIZE)
-			return -EINVAL;
-		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-			return -EFAULT;
-		copy = nbytes;
-	}
-	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
-}
-
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-			unsigned long mode,
-			unsigned long __user *nmask, unsigned long maxnode,
-			unsigned flags)
-{
-	nodemask_t nodes;
-	int err;
-
-	err = get_nodes(&nodes, nmask, maxnode);
-	if (err)
-		return err;
-	return do_mbind(start, len, mode, &nodes, flags);
-}
-
-/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-		unsigned long maxnode)
-{
-	int err;
-	nodemask_t nodes;
-
-	if (mode < 0 || mode > MPOL_MAX)
-		return -EINVAL;
-	err = get_nodes(&nodes, nmask, maxnode);
-	if (err)
-		return err;
-	return do_set_mempolicy(mode, &nodes);
-}
-
-/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-				unsigned long __user *nmask,
-				unsigned long maxnode,
-				unsigned long addr, unsigned long flags)
-{
-	int err, pval;
-	nodemask_t nodes;
-
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
-		return -EINVAL;
-
-	err = do_get_mempolicy(&pval, &nodes, addr, flags);
-
-	if (err)
-		return err;
-
-	if (policy && put_user(pval, policy))
-		return -EFAULT;
-
-	if (nmask)
-		err = copy_nodes_to_user(nmask, maxnode, &nodes);
-
-	return err;
-}
-
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -687,15 +649,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 	long err = 0;
 	unsigned long __user *nm = NULL;
 	unsigned long nr_bits, alloc_size;
-	nodemask_t bm;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
 
 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask) {
-		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+		err = compat_get_bitmap(bm, nmask, nr_bits);
 		nm = compat_alloc_user_space(alloc_size);
-		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+		err |= copy_to_user(nm, bm, alloc_size);
 	}
 
 	if (err)
@@ -714,7 +676,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
+		        pol = vma->vm_ops->get_policy(vma, addr);
 		else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
@@ -760,9 +722,10 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
+	BUG_ON(nid >= MAX_NUMNODES);
+	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
 	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
 	me->il_next = next;
 	return nid;
 }
@@ -771,27 +734,29 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
-	unsigned nnodes = nodes_weight(pol->v.nodes);
+	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
 	unsigned target = (unsigned)off % nnodes;
 	int c;
 	int nid = -1;
 
 	c = 0;
 	do {
-		nid = next_node(nid, pol->v.nodes);
+		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
 		c++;
 	} while (c <= target);
+	BUG_ON(nid >= MAX_NUMNODES);
+	BUG_ON(!test_bit(nid, pol->v.nodes));
 	return nid;
 }
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
-					unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
 
+	BUG_ON(!node_online(nid));
 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
@@ -834,6 +799,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 		if (vma) {
 			unsigned long off;
+			BUG_ON(addr >= vma->vm_end);
+			BUG_ON(addr < vma->vm_start);
 			off = vma->vm_pgoff;
 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
 			nid = offset_il_node(pol, vma, off);
@@ -911,7 +878,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_DEFAULT:
 		return 1;
 	case MPOL_INTERLEAVE:
-		return nodes_equal(a->v.nodes, b->v.nodes);
+		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
 	case MPOL_BIND: {
@@ -1150,7 +1117,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
-		npol ? nodes_addr(npol->v.nodes)[0] : -1);
+		npol ? npol->v.nodes[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1197,12 +1164,14 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
+							MAX_NUMNODES) < 0)
 		printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default */
+/* Reset policy of current process to default.
+ * Assumes fs == KERNEL_DS */
 void numa_default_policy(void)
 {
-	do_set_mempolicy(MPOL_DEFAULT, NULL);
+	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c
index 5ecc2cf3e1d7..fa11d91242e8 100644
--- a/trunk/mm/mmap.c
+++ b/trunk/mm/mmap.c
@@ -181,36 +181,26 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 }
 
 /*
- * Unlink a file-based vm structure from its prio_tree, to hide
- * vma from rmap and vmtruncate before freeing its page tables.
+ * Remove one vm structure and free it.
  */
-void unlink_file_vma(struct vm_area_struct *vma)
+static void remove_vm_struct(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 
+	might_sleep();
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
 		__remove_shared_vm_struct(vma, file, mapping);
 		spin_unlock(&mapping->i_mmap_lock);
 	}
-}
-
-/*
- * Close a vm structure and free it, returning the next.
- */
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
-{
-	struct vm_area_struct *next = vma->vm_next;
-
-	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
-		fput(vma->vm_file);
+	if (file)
+		fput(file);
+	anon_vma_unlink(vma);
 	mpol_free(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
-	return next;
 }
 
 asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -842,7 +832,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
 						struct file *file, long pages)
 {
 	const unsigned long stack_flags
@@ -1080,17 +1070,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-		if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
-						== (VM_WRITE | VM_RESERVED)) {
-			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
-				"PROT_WRITE mmap of VM_RESERVED memory, which "
-				"is deprecated. Please report this to "
-				"linux-kernel@vger.kernel.org\n",current->comm);
-			if (vma->vm_ops && vma->vm_ops->close)
-				vma->vm_ops->close(vma);
-			error = -EACCES;
-			goto unmap_and_free_vma;
-		}
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
@@ -1131,7 +1110,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	}
 out:	
 	mm->total_vm += len >> PAGE_SHIFT;
-	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
 		make_pages_present(addr, addr + len);
@@ -1496,19 +1475,15 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 	mm->total_vm += grow;
 	if (vma->vm_flags & VM_LOCKED)
 		mm->locked_vm += grow;
-	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
 }
 
-#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+#ifdef CONFIG_STACK_GROWSUP
 /*
- * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
- * vma is the last one with address > vma->vm_end.  Have to extend vma.
+ * vma is the first one with address > vma->vm_end.  Have to extend vma.
  */
-#ifdef CONFIG_STACK_GROWSUP
-static inline
-#endif
-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+int expand_stack(struct vm_area_struct * vma, unsigned long address)
 {
 	int error;
 
@@ -1546,13 +1521,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	anon_vma_unlock(vma);
 	return error;
 }
-#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
-
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
-	return expand_upwards(vma, address);
-}
 
 struct vm_area_struct *
 find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1635,24 +1603,36 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 }
 #endif
 
+/* Normal function to fix up a mapping
+ * This function is the default for when an area has no specific
+ * function.  This may be used as part of a more specific routine.
+ *
+ * By the time this function is called, the area struct has been
+ * removed from the process mapping list.
+ */
+static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
+{
+	size_t len = area->vm_end - area->vm_start;
+
+	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
+	if (area->vm_flags & VM_LOCKED)
+		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
+	vm_stat_unaccount(area);
+	remove_vm_struct(area);
+}
+
 /*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Update the VMA and inode share lists.
  *
- * Called with the mm semaphore held.
+ * Ok - we have the memory areas we should free on the 'free' list,
+ * so release them, and do the vma updates.
  */
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	/* Update high watermark before we lower total_vm */
-	update_hiwater_vm(mm);
 	do {
-		long nrpages = vma_pages(vma);
-
-		mm->total_vm -= nrpages;
-		if (vma->vm_flags & VM_LOCKED)
-			mm->locked_vm -= nrpages;
-		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
-		vma = remove_vma(vma);
+		struct vm_area_struct *next = vma->vm_next;
+		unmap_vma(mm, vma);
+		vma = next;
 	} while (vma);
 	validate_mm(mm);
 }
@@ -1671,13 +1651,14 @@ static void unmap_region(struct mm_struct *mm,
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
+	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
-	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1818,7 +1799,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	unmap_region(mm, vma, prev, start, end);
 
 	/* Fix up all other VM information */
-	remove_vma_list(mm, vma);
+	unmap_vma_list(mm, vma);
 
 	return 0;
 }
@@ -1952,21 +1933,34 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	lru_add_drain();
+
+	spin_lock(&mm->page_table_lock);
+
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
-	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
+	mm->mmap = mm->mmap_cache = NULL;
+	mm->mm_rb = RB_ROOT;
+	set_mm_counter(mm, rss, 0);
+	mm->total_vm = 0;
+	mm->locked_vm = 0;
+
+	spin_unlock(&mm->page_table_lock);
+
 	/*
-	 * Walk the list again, actually closing and freeing it,
-	 * with preemption enabled, without holding any MM locks.
+	 * Walk the list again, actually closing and freeing it
+	 * without holding any MM locks.
 	 */
-	while (vma)
-		vma = remove_vma(vma);
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		remove_vm_struct(vma);
+		vma = next;
+	}
 
 	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c
index 17a2b52b753b..57577f63b305 100644
--- a/trunk/mm/mprotect.c
+++ b/trunk/mm/mprotect.c
@@ -29,9 +29,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot)
 {
 	pte_t *pte;
-	spinlock_t *ptl;
 
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = pte_offset_map(pmd, addr);
 	do {
 		if (pte_present(*pte)) {
 			pte_t ptent;
@@ -45,7 +44,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			lazy_mmu_prot_update(ptent);
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap(pte - 1);
 }
 
 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -89,6 +88,7 @@ static void change_protection(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -96,6 +96,7 @@ static void change_protection(struct vm_area_struct *vma,
 		change_pud_range(mm, pgd, addr, next, newprot);
 	} while (pgd++, addr = next, addr != end);
 	flush_tlb_range(vma, start, end);
+	spin_unlock(&mm->page_table_lock);
 }
 
 static int
@@ -124,14 +125,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (oldflags & VM_RESERVED) {
-			BUG_ON(oldflags & VM_WRITE);
-			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
-				"PROT_WRITE mprotect of VM_RESERVED memory, "
-				"which is deprecated. Please report this to "
-				"linux-kernel@vger.kernel.org\n",current->comm);
-			return -EACCES;
-		}
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
@@ -175,8 +168,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
 	change_protection(vma, start, end, newprot);
-	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+	__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
 
 fail:
diff --git a/trunk/mm/mremap.c b/trunk/mm/mremap.c
index b535438c363c..f343fc73a8bd 100644
--- a/trunk/mm/mremap.c
+++ b/trunk/mm/mremap.c
@@ -22,7 +22,35 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none_or_clear_bad(pgd))
+		goto end;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none_or_clear_bad(pud))
+		goto end;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none_or_clear_bad(pmd))
+		goto end;
+
+	pte = pte_offset_map_nested(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_unmap_nested(pte);
+		pte = NULL;
+	}
+end:
+	return pte;
+}
+
+static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -40,39 +68,35 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 	if (pmd_none_or_clear_bad(pmd))
 		return NULL;
 
-	return pmd;
+	return pte_offset_map(pmd, addr);
 }
 
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
+	pte_t *pte = NULL;
 
 	pgd = pgd_offset(mm, addr);
+
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
 		return NULL;
-
 	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return NULL;
-
-	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
-		return NULL;
-
-	return pmd;
+	if (pmd)
+		pte = pte_alloc_map(mm, pmd, addr);
+	return pte;
 }
 
-static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
-		unsigned long old_addr, unsigned long old_end,
-		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+static int
+move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
+		struct vm_area_struct *new_vma, unsigned long new_addr)
 {
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
-	pte_t *old_pte, *new_pte, pte;
-	spinlock_t *old_ptl, *new_ptl;
+	int error = 0;
+	pte_t *src, *dst;
 
 	if (vma->vm_file) {
 		/*
@@ -87,69 +111,74 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
 	}
+	spin_lock(&mm->page_table_lock);
 
-	/*
-	 * We don't have to worry about the ordering of src and dst
-	 * pte locks because exclusive mmap_sem prevents deadlock.
-	 */
-	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- 	new_pte = pte_offset_map_nested(new_pmd, new_addr);
-	new_ptl = pte_lockptr(mm, new_pmd);
-	if (new_ptl != old_ptl)
-		spin_lock(new_ptl);
-
-	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
-				   new_pte++, new_addr += PAGE_SIZE) {
-		if (pte_none(*old_pte))
-			continue;
-		pte = ptep_clear_flush(vma, old_addr, old_pte);
-		/* ZERO_PAGE can be dependant on virtual addr */
-		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
-		set_pte_at(mm, new_addr, new_pte, pte);
+	src = get_one_pte_map_nested(mm, old_addr);
+	if (src) {
+		/*
+		 * Look to see whether alloc_one_pte_map needs to perform a
+		 * memory allocation.  If it does then we need to drop the
+		 * atomic kmap
+		 */
+		dst = get_one_pte_map(mm, new_addr);
+		if (unlikely(!dst)) {
+			pte_unmap_nested(src);
+			if (mapping)
+				spin_unlock(&mapping->i_mmap_lock);
+			dst = alloc_one_pte_map(mm, new_addr);
+			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
+				spin_unlock(&mm->page_table_lock);
+				spin_lock(&mapping->i_mmap_lock);
+				spin_lock(&mm->page_table_lock);
+			}
+			src = get_one_pte_map_nested(mm, old_addr);
+		}
+		/*
+		 * Since alloc_one_pte_map can drop and re-acquire
+		 * page_table_lock, we should re-check the src entry...
+		 */
+		if (src) {
+			if (dst) {
+				pte_t pte;
+				pte = ptep_clear_flush(vma, old_addr, src);
+
+				/* ZERO_PAGE can be dependant on virtual addr */
+				pte = move_pte(pte, new_vma->vm_page_prot,
+							old_addr, new_addr);
+				set_pte_at(mm, new_addr, dst, pte);
+			} else
+				error = -ENOMEM;
+			pte_unmap_nested(src);
+		}
+		if (dst)
+			pte_unmap(dst);
 	}
-
-	if (new_ptl != old_ptl)
-		spin_unlock(new_ptl);
-	pte_unmap_nested(new_pte - 1);
-	pte_unmap_unlock(old_pte - 1, old_ptl);
+	spin_unlock(&mm->page_table_lock);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
+	return error;
 }
 
-#define LATENCY_LIMIT	(64 * PAGE_SIZE)
-
 static unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len)
 {
-	unsigned long extent, next, old_end;
-	pmd_t *old_pmd, *new_pmd;
+	unsigned long offset;
 
-	old_end = old_addr + len;
-	flush_cache_range(vma, old_addr, old_end);
+	flush_cache_range(vma, old_addr, old_addr + len);
 
-	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
-		cond_resched();
-		next = (old_addr + PMD_SIZE) & PMD_MASK;
-		if (next - 1 > old_end)
-			next = old_end;
-		extent = next - old_addr;
-		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
-		if (!old_pmd)
-			continue;
-		new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
-		if (!new_pmd)
+	/*
+	 * This is not the clever way to do this, but we're taking the
+	 * easy way out on the assumption that most remappings will be
+	 * only a few pages.. This also makes error recovery easier.
+	 */
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr + offset,
+				new_vma, new_addr + offset) < 0)
 			break;
-		next = (new_addr + PMD_SIZE) & PMD_MASK;
-		if (extent > next - new_addr)
-			extent = next - new_addr;
-		if (extent > LATENCY_LIMIT)
-			extent = LATENCY_LIMIT;
-		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-				new_vma, new_pmd, new_addr);
+		cond_resched();
 	}
-
-	return len + old_addr - old_end;	/* how much done */
+	return offset;
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
@@ -162,7 +191,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long new_pgoff;
 	unsigned long moved_len;
 	unsigned long excess = 0;
-	unsigned long hiwater_vm;
 	int split = 0;
 
 	/*
@@ -201,24 +229,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	/*
-	 * If we failed to move page tables we still do total_vm increment
-	 * since do_munmap() will decrement it by old_len == new_len.
-	 *
-	 * Since total_vm is about to be raised artificially high for a
-	 * moment, we need to restore high watermark afterwards: if stats
-	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
-	 * If this were a serious issue, we'd add a flag to do_munmap().
+	 * if we failed to move page tables we still do total_vm increment
+	 * since do_munmap() will decrement it by old_len == new_len
 	 */
-	hiwater_vm = mm->hiwater_vm;
 	mm->total_vm += new_len >> PAGE_SHIFT;
-	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
-	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
@@ -248,7 +269,6 @@ unsigned long do_mremap(unsigned long addr,
 	unsigned long old_len, unsigned long new_len,
 	unsigned long flags, unsigned long new_addr)
 {
-	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
@@ -289,7 +309,7 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
-		ret = do_munmap(mm, new_addr, new_len);
+		ret = do_munmap(current->mm, new_addr, new_len);
 		if (ret)
 			goto out;
 	}
@@ -300,7 +320,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * do_munmap does all the needed commit accounting
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
 		if (ret && old_len != new_len)
 			goto out;
 		ret = addr;
@@ -313,7 +333,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * Ok, we need to grow..  or relocate.
 	 */
 	ret = -EFAULT;
-	vma = find_vma(mm, addr);
+	vma = find_vma(current->mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
 	if (is_vm_hugetlb_page(vma)) {
@@ -329,14 +349,14 @@ unsigned long do_mremap(unsigned long addr,
 	}
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
-		locked = mm->locked_vm << PAGE_SHIFT;
+		locked = current->mm->locked_vm << PAGE_SHIFT;
 		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 		locked += new_len - old_len;
 		ret = -EAGAIN;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			goto out;
 	}
-	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
+	if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -363,10 +383,11 @@ unsigned long do_mremap(unsigned long addr,
 			vma_adjust(vma, vma->vm_start,
 				addr + new_len, vma->vm_pgoff, NULL);
 
-			mm->total_vm += pages;
-			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+			current->mm->total_vm += pages;
+			__vm_stat_account(vma->vm_mm, vma->vm_flags,
+							vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
-				mm->locked_vm += pages;
+				current->mm->locked_vm += pages;
 				make_pages_present(addr + old_len,
 						   addr + new_len);
 			}
diff --git a/trunk/mm/msync.c b/trunk/mm/msync.c
index 0e040e9c39d8..d0f5a1bce7cb 100644
--- a/trunk/mm/msync.c
+++ b/trunk/mm/msync.c
@@ -17,48 +17,40 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+/*
+ * Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
+
+static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
-	spinlock_t *ptl;
-	int progress = 0;
 
-again:
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map(pmd, addr);
 	do {
 		unsigned long pfn;
 		struct page *page;
 
-		if (progress >= 64) {
-			progress = 0;
-			if (need_resched() || need_lockbreak(ptl))
-				break;
-		}
-		progress++;
 		if (!pte_present(*pte))
 			continue;
 		if (!pte_maybe_dirty(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, addr);
+		if (!pfn_valid(pfn))
 			continue;
-		}
 		page = pfn_to_page(pfn);
+		if (PageReserved(page))
+			continue;
 
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-	if (addr != end)
-		goto again;
+	pte_unmap(pte - 1);
 }
 
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
@@ -69,11 +61,11 @@ static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		msync_pte_range(vma, pmd, addr, next);
+		sync_pte_range(vma, pmd, addr, next);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
@@ -84,34 +76,58 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		msync_pmd_range(vma, pud, addr, next);
+		sync_pmd_range(vma, pud, addr, next);
 	} while (pud++, addr = next, addr != end);
 }
 
-static void msync_page_range(struct vm_area_struct *vma,
+static void sync_page_range(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long end)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	unsigned long next;
 
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
-	 * to do anything more on an msync().
-	 * Can't do anything with VM_RESERVED regions either.
-	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
+	 * to do anything more on an msync() */
+	if (is_vm_hugetlb_page(vma))
 		return;
 
 	BUG_ON(addr >= end);
-	pgd = pgd_offset(vma->vm_mm, addr);
+	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		msync_pud_range(vma, pgd, addr, next);
+		sync_pud_range(vma, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&mm->page_table_lock);
+}
+
+#ifdef CONFIG_PREEMPT
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	const size_t chunk = 64 * 1024;	/* bytes */
+	unsigned long next;
+
+	do {
+		next = addr + chunk;
+		if (next > end || next < addr)
+			next = end;
+		sync_page_range(vma, addr, next);
+		cond_resched();
+	} while (addr = next, addr != end);
+}
+#else
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	sync_page_range(vma, addr, end);
 }
+#endif
 
 /*
  * MS_SYNC syncs the entire file - including mappings.
@@ -134,7 +150,7 @@ static int msync_interval(struct vm_area_struct *vma,
 		return -EBUSY;
 
 	if (file && (vma->vm_flags & VM_SHARED)) {
-		msync_page_range(vma, addr, end);
+		filemap_sync(vma, addr, end);
 
 		if (flags & MS_SYNC) {
 			struct address_space *mapping = file->f_mapping;
diff --git a/trunk/mm/nommu.c b/trunk/mm/nommu.c
index d1e076a487cb..0ef241ae3763 100644
--- a/trunk/mm/nommu.c
+++ b/trunk/mm/nommu.c
@@ -931,8 +931,6 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 	realalloc -= kobjsize(vml);
 	askedalloc -= sizeof(*vml);
 	kfree(vml);
-
-	update_hiwater_vm(mm);
 	mm->total_vm -= len >> PAGE_SHIFT;
 
 #ifdef DEBUG
@@ -1049,8 +1047,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
-			unsigned int foll_flags)
+struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
 {
 	return NULL;
 }
@@ -1081,6 +1078,19 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
+void update_mem_hiwater(struct task_struct *tsk)
+{
+	unsigned long rss;
+
+	if (likely(tsk->mm)) {
+		rss = get_mm_counter(tsk->mm, rss);
+		if (tsk->mm->hiwater_rss < rss)
+			tsk->mm->hiwater_rss = rss;
+		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
+			tsk->mm->hiwater_vm = tsk->mm->total_vm;
+	}
+}
+
 void unmap_mapping_range(struct address_space *mapping,
 			 loff_t const holebegin, loff_t const holelen,
 			 int even_cows)
diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c
index 2dbdd98426fd..94c864eac9c4 100644
--- a/trunk/mm/page_alloc.c
+++ b/trunk/mm/page_alloc.c
@@ -33,7 +33,6 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 
@@ -79,44 +78,21 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
-{
-	int ret = 0;
-	unsigned seq;
-	unsigned long pfn = page_to_pfn(page);
-
-	do {
-		seq = zone_span_seqbegin(zone);
-		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
-			ret = 1;
-		else if (pfn < zone->zone_start_pfn)
-			ret = 1;
-	} while (zone_span_seqretry(zone, seq));
-
-	return ret;
-}
-
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
-		return 0;
-#endif
-	if (zone != page_zone(page))
-		return 0;
-
-	return 1;
-}
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
 static int bad_range(struct zone *zone, struct page *page)
 {
-	if (page_outside_zone_boundaries(zone, page))
+	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
 		return 1;
-	if (!page_is_consistent(zone, page))
+	if (page_to_pfn(page) < zone->zone_start_pfn)
+		return 1;
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 1;
+#endif
+	if (zone != page_zone(page))
 		return 1;
-
 	return 0;
 }
 
@@ -138,8 +114,7 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved );
+			1 << PG_writeback);
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -178,7 +153,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 		struct page *p = page + i;
 
 		SetPageCompound(p);
-		set_page_private(p, (unsigned long)page);
+		p->private = (unsigned long)page;
 	}
 }
 
@@ -198,7 +173,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (page_private(p) != (unsigned long)page)
+		if (p->private != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -211,18 +186,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-	return page_private(page);
+	return page->private;
 }
 
 static inline void set_page_order(struct page *page, int order) {
-	set_page_private(page, order);
+	page->private = order;
 	__SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
 	__ClearPagePrivate(page);
-	set_page_private(page, 0);
+	page->private = 0;
 }
 
 /*
@@ -262,13 +237,14 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page_private(page) and PG_private.
+ * for recording page's order, we use page->private and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
+           !PageReserved(page)         &&
             page_count(page) == 0)
                return 1;
        return 0;
@@ -288,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page_private(page) field.
+ * order is recorded in page->private field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
  * free, the remainder of the region must be split into blocks.   
@@ -351,8 +327,7 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved )))
+			1 << PG_writeback )))
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -480,14 +455,13 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_reclaim	|
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved )))
+			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
-	set_page_private(page, 0);
+	page->private = 0;
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 }
@@ -1042,7 +1016,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-	if (put_page_testzero(page)) {
+	if (!PageReserved(page) && put_page_testzero(page)) {
 		if (order == 0)
 			free_hot_page(page);
 		else
@@ -1331,9 +1305,12 @@ void show_free_areas(void)
 		} else
 			printk("\n");
 
-		for_each_cpu(cpu) {
+		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
 			struct per_cpu_pageset *pageset;
 
+			if (!cpu_possible(cpu))
+				continue;
+
 			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
@@ -1683,7 +1660,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
@@ -1697,7 +1674,7 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 1);
+		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
@@ -1744,29 +1721,29 @@ static int __devinit zone_batchsize(struct zone *zone)
 
 	/*
 	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/2 of a meg.
+	 * size of the zone.  But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
 	 *
 	 * OK, so we don't know how big the cache is.  So guess.
 	 */
 	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;
 
 	/*
-	 * We will be trying to allcoate bigger chunks of contiguous
-	 * memory of the order of fls(batch).  This should result in
-	 * better cache coloring.
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
 	 *
-	 * A sanity check also to ensure that batch is still in limits.
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
 	 */
-	batch = (1 << fls(batch + batch/2));
-
-	if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
-		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
-
+	batch = (1 << fls(batch + batch/2)) - 1;
 	return batch;
 }
 
@@ -1778,7 +1755,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];		/* hot */
 	pcp->count = 0;
-	pcp->low = 0;
+	pcp->low = 2 * batch;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
@@ -1787,7 +1764,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	pcp->count = 0;
 	pcp->low = 0;
 	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
+	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
 }
 
@@ -1896,60 +1873,6 @@ void __init setup_per_cpu_pageset()
 
 #endif
 
-static __devinit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
-	for(i = 0; i < zone->wait_table_size; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-}
-
-static __devinit void zone_pcp_init(struct zone *zone)
-{
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone->pageset[cpu] = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
-	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-		zone->name, zone->present_pages, batch);
-}
-
-static __devinit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
-	pgdat->nr_zones = zone_idx(zone) + 1;
-
-	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-	zone->zone_start_pfn = zone_start_pfn;
-
-	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-}
-
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -1959,11 +1882,10 @@ static __devinit void init_currently_empty_zone(struct zone *zone,
 static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long j;
-	int nid = pgdat->node_id;
+	unsigned long i, j;
+	int cpu, nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
-	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
@@ -1971,6 +1893,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
+		unsigned long batch;
 
 		realsize = size = zones_size[j];
 		if (zholes_size)
@@ -1985,13 +1908,24 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
-		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		zone_pcp_init(zone);
+		batch = zone_batchsize(zone);
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+			/* Early boot. Slab allocator not functional yet */
+			zone->pageset[cpu] = &boot_pageset[cpu];
+			setup_pageset(&boot_pageset[cpu],0);
+#else
+			setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+		}
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		zone->nr_scan_active = 0;
@@ -2002,9 +1936,32 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
+		/*
+		 * The per-page waitqueue mechanism uses hashed waitqueues
+		 * per zone.
+		 */
+		zone->wait_table_size = wait_table_size(size);
+		zone->wait_table_bits =
+			wait_table_bits(zone->wait_table_size);
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, zone->wait_table_size
+						* sizeof(wait_queue_head_t));
+
+		for(i = 0; i < zone->wait_table_size; ++i)
+			init_waitqueue_head(zone->wait_table + i);
+
+		pgdat->nr_zones = j+1;
+
+		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+		zone->zone_start_pfn = zone_start_pfn;
+
+		memmap_init(size, nid, j, zone_start_pfn);
+
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
+
 		zone_start_pfn += size;
+
+		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
 	}
 }
 
@@ -2404,7 +2361,7 @@ static void setup_per_zone_lowmem_reserve(void)
  *	that the pages_{min,low,high} values for each zone are set correctly 
  *	with respect to min_free_kbytes.
  */
-void setup_per_zone_pages_min(void)
+static void setup_per_zone_pages_min(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
diff --git a/trunk/mm/page_io.c b/trunk/mm/page_io.c
index bb2b0d53889c..330e00d6db00 100644
--- a/trunk/mm/page_io.c
+++ b/trunk/mm/page_io.c
@@ -91,8 +91,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
-	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
-				end_swap_bio_write);
+	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
 		unlock_page(page);
@@ -116,8 +115,7 @@ int swap_readpage(struct file *file, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
-	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
-				end_swap_bio_read);
+	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
 		ret = -ENOMEM;
diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c
index 914d04b98bee..450f5241b5a5 100644
--- a/trunk/mm/rmap.c
+++ b/trunk/mm/rmap.c
@@ -32,7 +32,7 @@
  *   page->flags PG_locked (lock_page)
  *     mapping->i_mmap_lock
  *       anon_vma->lock
- *         mm->page_table_lock or pte_lock
+ *         mm->page_table_lock
  *           zone->lru_lock (in mark_page_accessed)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,44 +244,37 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 /*
  * Check that @page is mapped at @address into @mm.
  *
- * On success returns with pte mapped and locked.
+ * On success returns with mapped pte and locked mm->page_table_lock.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address, spinlock_t **ptlp)
+			  unsigned long address)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	spinlock_t *ptl;
 
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, address);
-	if (!pgd_present(*pgd))
-		return NULL;
-
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return NULL;
-
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return NULL;
-
-	pte = pte_offset_map(pmd, address);
-	/* Make a quick check before getting the lock */
-	if (!pte_present(*pte)) {
-		pte_unmap(pte);
-		return NULL;
-	}
-
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
-	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
-		*ptlp = ptl;
-		return pte;
+	if (likely(pgd_present(*pgd))) {
+		pud = pud_offset(pgd, address);
+		if (likely(pud_present(*pud))) {
+			pmd = pmd_offset(pud, address);
+			if (likely(pmd_present(*pmd))) {
+				pte = pte_offset_map(pmd, address);
+				if (likely(pte_present(*pte) &&
+					   page_to_pfn(page) == pte_pfn(*pte)))
+					return pte;
+				pte_unmap(pte);
+			}
+		}
 	}
-	pte_unmap_unlock(pte, ptl);
-	return NULL;
+	spin_unlock(&mm->page_table_lock);
+	return ERR_PTR(-ENOENT);
 }
 
 /*
@@ -294,28 +287,24 @@ static int page_referenced_one(struct page *page,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *pte;
-	spinlock_t *ptl;
 	int referenced = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
-	if (!pte)
-		goto out;
-
-	if (ptep_clear_flush_young(vma, address, pte))
-		referenced++;
+	pte = page_check_address(page, mm, address);
+	if (!IS_ERR(pte)) {
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
 
-	/* Pretend the page is referenced if the task has the
-	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
-			rwsem_is_locked(&mm->mmap_sem))
-		referenced++;
+		if (mm != current->mm && !ignore_token && has_swap_token(mm))
+			referenced++;
 
-	(*mapcount)--;
-	pte_unmap_unlock(pte, ptl);
+		(*mapcount)--;
+		pte_unmap(pte);
+		spin_unlock(&mm->page_table_lock);
+	}
 out:
 	return referenced;
 }
@@ -445,11 +434,15 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
  *
- * The caller needs to hold the pte lock.
+ * The caller needs to hold the mm->page_table_lock.
  */
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
+	BUG_ON(PageReserved(page));
+
+	inc_mm_counter(vma->vm_mm, anon_rss);
+
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -468,12 +461,13 @@ void page_add_anon_rmap(struct page *page,
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
  *
- * The caller needs to hold the pte lock.
+ * The caller needs to hold the mm->page_table_lock.
  */
 void page_add_file_rmap(struct page *page)
 {
 	BUG_ON(PageAnon(page));
-	BUG_ON(!pfn_valid(page_to_pfn(page)));
+	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
+		return;
 
 	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
@@ -483,10 +477,12 @@ void page_add_file_rmap(struct page *page)
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
  *
- * The caller needs to hold the pte lock.
+ * Caller needs to hold the mm->page_table_lock.
  */
 void page_remove_rmap(struct page *page)
 {
+	BUG_ON(PageReserved(page));
+
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		BUG_ON(page_mapcount(page) < 0);
 		/*
@@ -514,15 +510,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
-	spinlock_t *ptl;
 	int ret = SWAP_AGAIN;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
-	if (!pte)
+	pte = page_check_address(page, mm, address);
+	if (IS_ERR(pte))
 		goto out;
 
 	/*
@@ -546,11 +541,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
+		swp_entry_t entry = { .val = page->private };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
@@ -559,21 +551,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		swap_duplicate(entry);
 		if (list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
-			if (list_empty(&mm->mmlist))
-				list_add(&mm->mmlist, &init_mm.mmlist);
+			list_add(&mm->mmlist, &init_mm.mmlist);
 			spin_unlock(&mmlist_lock);
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 		dec_mm_counter(mm, anon_rss);
-	} else
-		dec_mm_counter(mm, file_rss);
+	}
 
+	dec_mm_counter(mm, rss);
 	page_remove_rmap(page);
 	page_cache_release(page);
 
 out_unmap:
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
 out:
 	return ret;
 }
@@ -607,14 +599,19 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, *original_pte;
 	pte_t pteval;
-	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
 	unsigned long pfn;
 
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
+
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
 	if (address < vma->vm_start)
@@ -624,33 +621,30 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		return;
+		goto out_unlock;
 
 	pud = pud_offset(pgd, address);
 	if (!pud_present(*pud))
-		return;
+		goto out_unlock;
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
-		return;
-
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+		goto out_unlock;
 
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
+	for (original_pte = pte = pte_offset_map(pmd, address);
+			address < end; pte++, address += PAGE_SIZE) {
 
-	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
 
 		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, address);
+		if (!pfn_valid(pfn))
 			continue;
-		}
 
 		page = pfn_to_page(pfn);
 		BUG_ON(PageAnon(page));
+		if (PageReserved(page))
+			continue;
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
@@ -669,10 +663,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		dec_mm_counter(mm, rss);
 		(*mapcount)--;
 	}
-	pte_unmap_unlock(pte - 1, ptl);
+
+	pte_unmap(original_pte);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
 }
 
 static int try_to_unmap_anon(struct page *page)
@@ -809,6 +806,7 @@ int try_to_unmap(struct page *page)
 {
 	int ret;
 
+	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
diff --git a/trunk/mm/shmem.c b/trunk/mm/shmem.c
index dc25565a61e9..55e04a0734c1 100644
--- a/trunk/mm/shmem.c
+++ b/trunk/mm/shmem.c
@@ -71,6 +71,9 @@
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
+/* Keep swapped page count in private field of indirect struct page */
+#define nr_swapped		private
+
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
 	SGP_QUICK,	/* don't try more than file page cache lookup */
@@ -321,10 +324,8 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
 
 	entry->val = value;
 	info->swapped += incdec;
-	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
-		struct page *page = kmap_atomic_to_page(entry);
-		set_page_private(page, page_private(page) + incdec);
-	}
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
+		kmap_atomic_to_page(entry)->nr_swapped += incdec;
 }
 
 /*
@@ -367,8 +368,9 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-		if (page)
-			set_page_private(page, 0);
+		if (page) {
+			page->nr_swapped = 0;
+		}
 		spin_lock(&info->lock);
 
 		if (!page) {
@@ -559,7 +561,7 @@ static void shmem_truncate(struct inode *inode)
 			diroff = 0;
 		}
 		subdir = dir[diroff];
-		if (subdir && page_private(subdir)) {
+		if (subdir && subdir->nr_swapped) {
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
@@ -570,10 +572,10 @@ static void shmem_truncate(struct inode *inode)
 			nr_swaps_freed += freed;
 			if (offset)
 				spin_lock(&info->lock);
-			set_page_private(subdir, page_private(subdir) - freed);
+			subdir->nr_swapped -= freed;
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(page_private(subdir) > offset);
+			BUG_ON(subdir->nr_swapped > offset);
 		}
 		if (offset)
 			offset = 0;
@@ -741,7 +743,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 			dir = shmem_dir_map(subdir);
 		}
 		subdir = *dir;
-		if (subdir && page_private(subdir)) {
+		if (subdir && subdir->nr_swapped) {
 			ptr = shmem_swp_map(subdir);
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
@@ -1199,7 +1201,7 @@ static int shmem_populate(struct vm_area_struct *vma,
 				page_cache_release(page);
 				return err;
 			}
-		} else if (vma->vm_flags & VM_NONLINEAR) {
+		} else {
 			/* No page was found just because we can't read it in
 			 * now (being here implies nonblock != 0), but the page
 			 * may exist, so set the PTE to fault it in later. */
@@ -1504,10 +1506,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 			 */
 			if (!offset)
 				mark_page_accessed(page);
-		} else {
+		} else
 			page = ZERO_PAGE(0);
-			page_cache_get(page);
-		}
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
diff --git a/trunk/mm/slab.c b/trunk/mm/slab.c
index 22bfb0b2ac8b..d30423f167a2 100644
--- a/trunk/mm/slab.c
+++ b/trunk/mm/slab.c
@@ -2419,7 +2419,6 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
 			next = slab_bufctl(slabp)[slabp->free];
 #if DEBUG
 			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-			WARN_ON(numa_node_id() != slabp->nodeid);
 #endif
 		       	slabp->free = next;
 		}
@@ -2634,10 +2633,8 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
 		check_spinlock_acquired_node(cachep, node);
 		check_slabp(cachep, slabp);
 
-#if DEBUG
-		/* Verify that the slab belongs to the intended node */
-		WARN_ON(slabp->nodeid != node);
 
+#if DEBUG
 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 			printk(KERN_ERR "slab: double free detected in cache "
 					"'%s', objp %p\n", cachep->name, objp);
diff --git a/trunk/mm/sparse.c b/trunk/mm/sparse.c
index 72079b538e2d..347249a4917a 100644
--- a/trunk/mm/sparse.c
+++ b/trunk/mm/sparse.c
@@ -5,10 +5,8 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
-#include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/vmalloc.h>
 #include <asm/dma.h>
 
 /*
@@ -74,31 +72,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
-/*
- * Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case becase
- * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
- */
-int __section_nr(struct mem_section* ms)
-{
-	unsigned long root_nr;
-	struct mem_section* root;
-
-	for (root_nr = 0;
-	     root_nr < NR_MEM_SECTIONS;
-	     root_nr += SECTIONS_PER_ROOT) {
-		root = __nr_to_section(root_nr);
-
-		if (!root)
-			continue;
-
-		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
-		     break;
-	}
-
-	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -189,45 +162,6 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 	return NULL;
 }
 
-static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
-{
-	struct page *page, *ret;
-	unsigned long memmap_size = sizeof(struct page) * nr_pages;
-
-	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
-	if (page)
-		goto got_map_page;
-
-	ret = vmalloc(memmap_size);
-	if (ret)
-		goto got_map_ptr;
-
-	return NULL;
-got_map_page:
-	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-	memset(ret, 0, memmap_size);
-
-	return ret;
-}
-
-static int vaddr_in_vmalloc_area(void *addr)
-{
-	if (addr >= (void *)VMALLOC_START &&
-	    addr < (void *)VMALLOC_END)
-		return 1;
-	return 0;
-}
-
-static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
-{
-	if (vaddr_in_vmalloc_area(memmap))
-		vfree(memmap);
-	else
-		free_pages((unsigned long)memmap,
-			   get_order(sizeof(struct page) * nr_pages));
-}
-
 /*
  * Allocate the accumulated non-linear sections, allocate a mem_map
  * for each and record the physical to section mapping.
@@ -253,37 +187,14 @@ void sparse_init(void)
  * set.  If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
-			   int nr_pages)
+int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
 {
-	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	struct mem_section *ms;
-	struct page *memmap;
-	unsigned long flags;
-	int ret;
-
-	/*
-	 * no locking for this, because it does its own
-	 * plus, it does a kmalloc
-	 */
-	sparse_index_init(section_nr, pgdat->node_id);
-	memmap = __kmalloc_section_memmap(nr_pages);
+	struct mem_section *ms = __pfn_to_section(start_pfn);
 
-	pgdat_resize_lock(pgdat, &flags);
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
+		return -EEXIST;
 
-	ms = __pfn_to_section(start_pfn);
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-		ret = -EEXIST;
-		goto out;
-	}
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
-	ret = sparse_init_one_section(ms, section_nr, memmap);
-
-	if (ret <= 0)
-		__kfree_section_memmap(memmap, nr_pages);
-out:
-	pgdat_resize_unlock(pgdat, &flags);
-	return ret;
+	return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
 }
diff --git a/trunk/mm/swap.c b/trunk/mm/swap.c
index b89512877ec2..7771d2803f62 100644
--- a/trunk/mm/swap.c
+++ b/trunk/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page_private(page);
+		page = (struct page *)page->private;
 		if (put_page_testzero(page)) {
 			void (*dtor)(struct page *page);
 
@@ -48,7 +48,7 @@ void put_page(struct page *page)
 		}
 		return;
 	}
-	if (put_page_testzero(page))
+	if (!PageReserved(page) && put_page_testzero(page))
 		__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
 		struct page *page = pages[i];
 		struct zone *pagezone;
 
-		if (!put_page_testzero(page))
+		if (PageReserved(page) || !put_page_testzero(page))
 			continue;
 
 		pagezone = page_zone(page);
diff --git a/trunk/mm/swap_state.c b/trunk/mm/swap_state.c
index dfd9a46755b8..132164f7d0a7 100644
--- a/trunk/mm/swap_state.c
+++ b/trunk/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
-			set_page_private(page, entry.val);
+			page->private = entry.val;
 			total_swapcache_pages++;
 			pagecache_acct(1);
 		}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
-	set_page_private(page, 0);
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	page->private = 0;
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
 	pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
-	entry.val = page_private(page);
+	entry.val = page->private;
 
 	write_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
@@ -259,7 +259,8 @@ static inline void free_swap_cache(struct page *page)
 
 /* 
  * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * this page if it is the last user of the page. Can not do a lock_page,
+ * as we are holding the page_table_lock spinlock.
  */
 void free_page_and_swap_cache(struct page *page)
 {
diff --git a/trunk/mm/swapfile.c b/trunk/mm/swapfile.c
index 8970c0b74194..1dcaeda039f4 100644
--- a/trunk/mm/swapfile.c
+++ b/trunk/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	swp_entry_t entry;
 
 	down_read(&swap_unplug_sem);
-	entry.val = page_private(page);
+	entry.val = page->private;
 	if (PageSwapCache(page)) {
 		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
 		struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 		/*
 		 * If the page is removed from swapcache from under us (with a
 		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page_private(page) above.
-		 * If the WARN_ON triggers during a swapoff it maybe the race
+		 * count to avoid reading garbage from page->private above. If
+		 * the WARN_ON triggers during a swapoff it maybe the race
 		 * condition and it's harmless. However if it triggers without
 		 * swapoff it signals a problem.
 		 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
 	struct swap_info_struct *p;
 	swp_entry_t entry;
 
-	entry.val = page_private(page);
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
 		/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page_private(page);
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -398,14 +398,17 @@ void free_swap_and_cache(swp_entry_t entry)
 }
 
 /*
- * No need to decide whether this PTE shares the swap entry with others,
- * just let do_wp_page work it out if a write is requested later - to
- * force COW, vm_page_prot omits write permission from any private vma.
+ * Always set the resulting pte to be nowrite (the same as COW pages
+ * after one process has exited).  We don't know just how many PTEs will
+ * share this swap entry, so be cautious and let do_wp_page work out
+ * what to do if a write is requested later.
+ *
+ * vma->vm_mm->page_table_lock is held.
  */
 static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
-	inc_mm_counter(vma->vm_mm, anon_rss);
+	inc_mm_counter(vma->vm_mm, rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -422,25 +425,23 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				swp_entry_t entry, struct page *page)
 {
-	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
-	spinlock_t *ptl;
-	int found = 0;
+	pte_t swp_pte = swp_entry_to_pte(entry);
 
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map(pmd, addr);
 	do {
 		/*
 		 * swapoff spends a _lot_ of time in this loop!
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
-			break;
+			unuse_pte(vma, pte, addr, entry, page);
+			pte_unmap(pte);
+			return 1;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	pte_unmap(pte - 1);
+	return 0;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -522,10 +523,12 @@ static int unuse_mm(struct mm_struct *mm,
 		down_read(&mm->mmap_sem);
 		lock_page(page);
 	}
+	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (vma->anon_vma && unuse_vma(vma, entry, page))
 			break;
 	}
+	spin_unlock(&mm->page_table_lock);
 	up_read(&mm->mmap_sem);
 	/*
 	 * Currently unuse_mm cannot fail, but leave error handling
@@ -1042,7 +1045,7 @@ int page_queue_congested(struct page *page)
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
+		swp_entry_t entry = { .val = page->private };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
diff --git a/trunk/mm/thrash.c b/trunk/mm/thrash.c
index eff3c18c33a1..11461f7ad830 100644
--- a/trunk/mm/thrash.c
+++ b/trunk/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
 struct mm_struct * swap_token_mm = &init_mm;
 
 #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT	(300 * HZ)
+#define SWAP_TOKEN_TIMEOUT	0
 /*
  * Currently disabled; Needs further code to work at HZ * 300.
  */
diff --git a/trunk/mm/vmalloc.c b/trunk/mm/vmalloc.c
index 54a90e83cb31..1150229b6366 100644
--- a/trunk/mm/vmalloc.c
+++ b/trunk/mm/vmalloc.c
@@ -5,7 +5,6 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
- *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
 #include <linux/mm.h>
@@ -89,7 +88,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 {
 	pte_t *pte;
 
-	pte = pte_alloc_kernel(pmd, addr);
+	pte = pte_alloc_kernel(&init_mm, pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -147,18 +146,20 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
+	spin_lock(&init_mm.page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = vmap_pud_range(pgd, addr, next, prot, pages);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&init_mm.page_table_lock);
 	flush_cache_vmap((unsigned long) area->addr, end);
 	return err;
 }
 
-struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
-				unsigned long start, unsigned long end, int node)
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+				unsigned long start, unsigned long end)
 {
 	struct vm_struct **p, *tmp, *area;
 	unsigned long align = 1;
@@ -177,7 +178,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
 	addr = ALIGN(start, align);
 	size = PAGE_ALIGN(size);
 
-	area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
+	area = kmalloc(sizeof(*area), GFP_KERNEL);
 	if (unlikely(!area))
 		return NULL;
 
@@ -230,12 +231,6 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
 	return NULL;
 }
 
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-				unsigned long start, unsigned long end)
-{
-	return __get_vm_area_node(size, flags, start, end, -1);
-}
-
 /**
  *	get_vm_area  -  reserve a contingous kernel virtual area
  *
@@ -251,11 +246,6 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
 }
 
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
-{
-	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
-}
-
 /* Caller must hold vmlist_lock */
 struct vm_struct *__remove_vm_area(void *addr)
 {
@@ -352,6 +342,7 @@ void vfree(void *addr)
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 1);
 }
+
 EXPORT_SYMBOL(vfree);
 
 /**
@@ -369,6 +360,7 @@ void vunmap(void *addr)
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 0);
 }
+
 EXPORT_SYMBOL(vunmap);
 
 /**
@@ -400,10 +392,10 @@ void *vmap(struct page **pages, unsigned int count,
 
 	return area->addr;
 }
+
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
-				pgprot_t prot, int node)
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
@@ -414,9 +406,9 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE)
-		pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
+		pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
 	else
-		pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
+		pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
 	area->pages = pages;
 	if (!area->pages) {
 		remove_vm_area(area->addr);
@@ -426,10 +418,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	memset(area->pages, 0, array_size);
 
 	for (i = 0; i < area->nr_pages; i++) {
-		if (node < 0)
-			area->pages[i] = alloc_page(gfp_mask);
-		else
-			area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
+		area->pages[i] = alloc_page(gfp_mask);
 		if (unlikely(!area->pages[i])) {
 			/* Successfully allocated i pages, free them in __vunmap() */
 			area->nr_pages = i;
@@ -446,25 +435,18 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	return NULL;
 }
 
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
-	return __vmalloc_area_node(area, gfp_mask, prot, -1);
-}
-
 /**
- *	__vmalloc_node  -  allocate virtually contiguous memory
+ *	__vmalloc  -  allocate virtually contiguous memory
  *
  *	@size:		allocation size
  *	@gfp_mask:	flags for the page level allocator
  *	@prot:		protection mask for the allocated pages
- *	@node		node to use for allocation or -1
  *
  *	Allocate enough pages to cover @size from the page level
  *	allocator with @gfp_mask flags.  Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
-			int node)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct vm_struct *area;
 
@@ -472,18 +454,13 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 	if (!size || (size >> PAGE_SHIFT) > num_physpages)
 		return NULL;
 
-	area = get_vm_area_node(size, VM_ALLOC, node);
+	area = get_vm_area(size, VM_ALLOC);
 	if (!area)
 		return NULL;
 
-	return __vmalloc_area_node(area, gfp_mask, prot, node);
+	return __vmalloc_area(area, gfp_mask, prot);
 }
-EXPORT_SYMBOL(__vmalloc_node);
 
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
-{
-	return __vmalloc_node(size, gfp_mask, prot, -1);
-}
 EXPORT_SYMBOL(__vmalloc);
 
 /**
@@ -501,25 +478,8 @@ void *vmalloc(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
-EXPORT_SYMBOL(vmalloc);
 
-/**
- *	vmalloc_node  -  allocate memory on a specific node
- *
- *	@size:		allocation size
- *	@node;		numa node
- *
- *	Allocate enough pages to cover @size from the page level
- *	allocator and map them into contiguous kernel virtual space.
- *
- *	For tight cotrol over page level allocator and protection flags
- *	use __vmalloc() instead.
- */
-void *vmalloc_node(unsigned long size, int node)
-{
-       return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
-}
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc);
 
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
@@ -555,6 +515,7 @@ void *vmalloc_32(unsigned long size)
 {
 	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
 }
+
 EXPORT_SYMBOL(vmalloc_32);
 
 long vread(char *buf, char *addr, unsigned long count)
diff --git a/trunk/mm/vmscan.c b/trunk/mm/vmscan.c
index 135bf8ca96ee..843c87d1e61f 100644
--- a/trunk/mm/vmscan.c
+++ b/trunk/mm/vmscan.c
@@ -417,9 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
-				goto keep_locked;
+		if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
 			if (!add_to_swap(page))
 				goto activate_locked;
 		}
@@ -521,7 +519,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page_private(page) };
+			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
diff --git a/trunk/sound/core/pcm_native.c b/trunk/sound/core/pcm_native.c
index e97b2d162cc7..67abebabf83e 100644
--- a/trunk/sound/core/pcm_native.c
+++ b/trunk/sound/core/pcm_native.c
@@ -2949,7 +2949,8 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->status);
-	get_page(page);
+	if (!PageReserved(page))
+		get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -2991,7 +2992,8 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->control);
-	get_page(page);
+	if (!PageReserved(page))
+		get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -3064,7 +3066,8 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
 		vaddr = runtime->dma_area + offset;
 		page = virt_to_page(vaddr);
 	}
-	get_page(page);
+	if (!PageReserved(page))
+		get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;