From 4d4e4db03669c0331637e2fe96d31d2937ffe4b8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Sep 2005 14:41:40 +1000 Subject: [PATCH] --- yaml --- r: 11400 b: refs/heads/master c: d32311fed70d12f14e585feb4653571b1e2b0e6d h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/cachetlb.txt | 9 + trunk/Documentation/kernel-parameters.txt | 2 + trunk/Documentation/m68k/kernel-options.txt | 24 +- trunk/arch/alpha/mm/numa.c | 3 - trunk/arch/alpha/mm/remap.c | 6 +- trunk/arch/arm/kernel/signal.c | 96 +- trunk/arch/arm/kernel/traps.c | 14 +- trunk/arch/arm/mm/consistent.c | 6 +- trunk/arch/arm/mm/fault-armv.c | 7 +- trunk/arch/arm/mm/ioremap.c | 4 +- trunk/arch/arm/mm/mm-armv.c | 15 +- trunk/arch/arm/oprofile/backtrace.c | 46 +- trunk/arch/arm26/mm/memc.c | 18 +- trunk/arch/cris/arch-v32/mm/tlb.c | 6 +- trunk/arch/cris/mm/ioremap.c | 4 +- trunk/arch/frv/mm/dma-alloc.c | 5 +- trunk/arch/frv/mm/pgalloc.c | 4 +- trunk/arch/i386/kernel/vm86.c | 17 +- trunk/arch/i386/mm/discontig.c | 4 +- trunk/arch/i386/mm/init.c | 62 +- trunk/arch/i386/mm/ioremap.c | 4 +- trunk/arch/i386/mm/pgtable.c | 11 +- trunk/arch/i386/oprofile/backtrace.c | 38 +- trunk/arch/ia64/kernel/perfmon.c | 3 +- trunk/arch/ia64/mm/discontig.c | 7 +- trunk/arch/ia64/mm/fault.c | 34 +- trunk/arch/ia64/mm/init.c | 13 +- trunk/arch/ia64/mm/tlb.c | 2 - trunk/arch/m32r/mm/init.c | 9 +- trunk/arch/m32r/mm/ioremap.c | 4 +- trunk/arch/m68k/Kconfig | 24 +- trunk/arch/m68k/atari/stram.c | 918 +++++++++++++++++- trunk/arch/m68k/mm/kmap.c | 2 +- trunk/arch/m68k/sun3x/dvma.c | 2 +- trunk/arch/mips/kernel/irixelf.c | 1 + trunk/arch/mips/mm/ioremap.c | 4 +- trunk/arch/parisc/kernel/cache.c | 24 +- trunk/arch/parisc/kernel/pci-dma.c | 2 +- trunk/arch/parisc/mm/init.c | 3 - trunk/arch/parisc/mm/ioremap.c | 6 +- trunk/arch/ppc/kernel/dma-mapping.c | 6 +- trunk/arch/ppc/mm/4xx_mmu.c | 4 + trunk/arch/ppc/mm/pgtable.c | 4 +- trunk/arch/ppc64/kernel/vdso.c | 12 +- trunk/arch/ppc64/mm/imalloc.c | 5 + 
trunk/arch/ppc64/mm/init.c | 87 +- trunk/arch/s390/mm/ioremap.c | 4 +- trunk/arch/sh/mm/fault.c | 40 +- trunk/arch/sh/mm/hugetlbpage.c | 2 + trunk/arch/sh/mm/ioremap.c | 4 +- trunk/arch/sh64/mm/cache.c | 68 +- trunk/arch/sh64/mm/hugetlbpage.c | 188 +++- trunk/arch/sh64/mm/ioremap.c | 4 +- trunk/arch/sparc/mm/generic.c | 7 +- trunk/arch/sparc64/kernel/binfmt_aout32.c | 1 + trunk/arch/sparc64/mm/generic.c | 9 +- trunk/arch/sparc64/mm/tlb.c | 7 +- trunk/arch/um/include/tlb.h | 1 + trunk/arch/um/kernel/process_kern.c | 8 +- trunk/arch/um/kernel/skas/mmu.c | 4 +- trunk/arch/um/kernel/tt/tlb.c | 36 + trunk/arch/x86_64/ia32/ia32_aout.c | 1 + trunk/arch/x86_64/mm/ioremap.c | 4 +- trunk/drivers/acpi/acpi_memhotplug.c | 5 +- trunk/drivers/base/Makefile | 1 - trunk/drivers/base/init.c | 2 - trunk/drivers/base/memory.c | 452 --------- trunk/drivers/scsi/sg.c | 12 +- trunk/drivers/scsi/st.c | 10 +- trunk/fs/afs/file.c | 4 +- trunk/fs/binfmt_aout.c | 1 + trunk/fs/binfmt_elf.c | 1 + trunk/fs/binfmt_elf_fdpic.c | 7 + trunk/fs/binfmt_flat.c | 1 + trunk/fs/binfmt_som.c | 1 + trunk/fs/buffer.c | 2 +- trunk/fs/compat.c | 1 + trunk/fs/direct-io.c | 4 +- trunk/fs/exec.c | 17 +- trunk/fs/hugetlbfs/inode.c | 206 ++-- trunk/fs/jfs/jfs_metapage.c | 12 +- trunk/fs/proc/array.c | 2 +- trunk/fs/proc/task_mmu.c | 51 +- trunk/fs/xfs/linux-2.6/xfs_buf.c | 7 +- trunk/include/asm-alpha/barrier.h | 2 - trunk/include/asm-alpha/rwsem.h | 5 - trunk/include/asm-arm/tlb.h | 23 +- trunk/include/asm-arm26/tlb.h | 47 +- trunk/include/asm-generic/4level-fixup.h | 11 +- trunk/include/asm-generic/pgtable.h | 2 +- trunk/include/asm-generic/tlb.h | 23 +- trunk/include/asm-i386/mmzone.h | 6 + trunk/include/asm-i386/pgtable.h | 3 +- trunk/include/asm-i386/rwsem.h | 5 - trunk/include/asm-ia64/rwsem.h | 5 - trunk/include/asm-ia64/tlb.h | 19 +- trunk/include/asm-m32r/mmzone.h | 6 + trunk/include/asm-parisc/cacheflush.h | 35 +- trunk/include/asm-parisc/mmzone.h | 6 + trunk/include/asm-parisc/tlbflush.h | 3 +- 
trunk/include/asm-ppc/rwsem.h | 5 - trunk/include/asm-ppc64/mmzone.h | 3 + trunk/include/asm-ppc64/pgtable.h | 4 +- trunk/include/asm-ppc64/rwsem.h | 5 - trunk/include/asm-s390/rwsem.h | 5 - trunk/include/asm-sh/rwsem.h | 5 - trunk/include/asm-sparc64/rwsem.h | 5 - trunk/include/asm-sparc64/tlb.h | 29 +- trunk/include/asm-um/pgtable.h | 2 +- trunk/include/asm-x86_64/rwsem.h | 5 - trunk/include/linux/buffer_head.h | 6 +- trunk/include/linux/hugetlb.h | 2 + trunk/include/linux/memory.h | 94 -- trunk/include/linux/memory_hotplug.h | 104 -- trunk/include/linux/mempolicy.h | 7 +- trunk/include/linux/mm.h | 150 ++- trunk/include/linux/mmzone.h | 28 - trunk/include/linux/rmap.h | 4 +- trunk/include/linux/rwsem-spinlock.h | 5 - trunk/include/linux/scatterlist.h | 17 +- trunk/include/linux/sched.h | 65 +- trunk/include/linux/vmalloc.h | 8 +- trunk/ipc/shm.c | 7 +- trunk/kernel/acct.c | 2 +- trunk/kernel/exit.c | 5 +- trunk/kernel/fork.c | 31 +- trunk/kernel/futex.c | 6 +- trunk/kernel/kexec.c | 4 +- trunk/kernel/power/swsusp.c | 25 +- trunk/kernel/sched.c | 2 + trunk/kernel/timer.c | 9 - trunk/mm/Kconfig | 21 - trunk/mm/Makefile | 2 +- trunk/mm/bootmem.c | 1 - trunk/mm/filemap.c | 12 +- trunk/mm/filemap_xip.c | 22 +- trunk/mm/fremap.c | 86 +- trunk/mm/hugetlb.c | 207 ++-- trunk/mm/madvise.c | 2 +- trunk/mm/memory.c | 993 ++++++++++---------- trunk/mm/memory_hotplug.c | 138 --- trunk/mm/mempolicy.c | 393 ++++---- trunk/mm/mmap.c | 126 ++- trunk/mm/mprotect.c | 19 +- trunk/mm/mremap.c | 193 ++-- trunk/mm/msync.c | 78 +- trunk/mm/nommu.c | 18 +- trunk/mm/page_alloc.c | 207 ++-- trunk/mm/page_io.c | 6 +- trunk/mm/rmap.c | 146 ++- trunk/mm/shmem.c | 28 +- trunk/mm/slab.c | 5 +- trunk/mm/sparse.c | 99 +- trunk/mm/swap.c | 6 +- trunk/mm/swap_state.c | 11 +- trunk/mm/swapfile.c | 41 +- trunk/mm/thrash.c | 2 +- trunk/mm/vmalloc.c | 77 +- trunk/mm/vmscan.c | 6 +- trunk/sound/core/pcm_native.c | 9 +- 161 files changed, 3352 insertions(+), 3222 deletions(-) delete mode 100644 
trunk/drivers/base/memory.c delete mode 100644 trunk/include/linux/memory.h delete mode 100644 trunk/include/linux/memory_hotplug.h delete mode 100644 trunk/mm/memory_hotplug.c diff --git a/[refs] b/[refs] index d3ea6aca5ff5..f4dd7eca2155 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: b1459461f1e0abd5c28317d6bff6f2ca612a719d +refs/heads/master: d32311fed70d12f14e585feb4653571b1e2b0e6d diff --git a/trunk/Documentation/cachetlb.txt b/trunk/Documentation/cachetlb.txt index 7eb715e07eda..e132fb1163b0 100644 --- a/trunk/Documentation/cachetlb.txt +++ b/trunk/Documentation/cachetlb.txt @@ -49,6 +49,9 @@ changes occur: page table operations such as what happens during fork, and exec. + Platform developers note that generic code will always + invoke this interface without mm->page_table_lock held. + 3) void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -69,6 +72,9 @@ changes occur: call flush_tlb_page (see below) for each entry which may be modified. + Platform developers note that generic code will always + invoke this interface with mm->page_table_lock held. + 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) This time we need to remove the PAGE_SIZE sized translation @@ -87,6 +93,9 @@ changes occur: This is used primarily during fault processing. + Platform developers note that generic code will always + invoke this interface with mm->page_table_lock held. + 5) void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index 5dffcfefc3c7..90766b75d1b7 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -1460,6 +1460,8 @@ running once the system is up. 
stifb= [HW] Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] + stram_swap= [HW,M68k] + swiotlb= [IA-64] Number of I/O TLB slabs switches= [HW,M68k] diff --git a/trunk/Documentation/m68k/kernel-options.txt b/trunk/Documentation/m68k/kernel-options.txt index d5d3f064f552..e191baad8308 100644 --- a/trunk/Documentation/m68k/kernel-options.txt +++ b/trunk/Documentation/m68k/kernel-options.txt @@ -626,7 +626,7 @@ ignored (others aren't affected). can be performed in optimal order. Not all SCSI devices support tagged queuing (:-(). -4.5 switches= +4.6 switches= ------------- Syntax: switches=<list of switches> @@ -661,6 +661,28 @@ correctly. earlier initialization ("ov_"-less) takes precedence. But the switching-off on reset still happens in this case. +4.5) stram_swap= +---------------- + +Syntax: stram_swap=<do_swap>[,<max_swap>] + + This option is available only if the kernel has been compiled with +CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines +dynamically whether to actually use ST-RAM as swap space. (Currently, +the fraction of ST-RAM must be less or equal 1/3 of total memory to +enable this swapping.) You can override the kernel's decision by +specifying this option. 1 for <do_swap> means always enable the swap, +even if you have less alternate RAM. 0 stands for never swap to +ST-RAM, even if it's small enough compared to the rest of memory. + + If ST-RAM swapping is enabled, the kernel usually uses all free +ST-RAM as swap "device". If the kernel resides in ST-RAM, the region +allocated by it is obviously never used for swapping :-) You can also +limit this amount by specifying the second parameter, <max_swap>, if +you want to use parts of ST-RAM as normal system memory. <max_swap> is +in kBytes and the number should be a multiple of 4 (otherwise: rounded +down). 
+ 5) Options for Amiga Only: ========================== diff --git a/trunk/arch/alpha/mm/numa.c b/trunk/arch/alpha/mm/numa.c index 6d5251254f68..c7481d59b6df 100644 --- a/trunk/arch/alpha/mm/numa.c +++ b/trunk/arch/alpha/mm/numa.c @@ -371,8 +371,6 @@ show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { - unsigned long flags; - pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -386,7 +384,6 @@ show_mem(void) else shared += page_count(page) - 1; } - pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); diff --git a/trunk/arch/alpha/mm/remap.c b/trunk/arch/alpha/mm/remap.c index a78356c3ead5..19817ad3d89b 100644 --- a/trunk/arch/alpha/mm/remap.c +++ b/trunk/arch/alpha/mm/remap.c @@ -2,6 +2,7 @@ #include #include +/* called with the page_table_lock held */ static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) @@ -30,6 +31,7 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, } while (address && (address < end)); } +/* called with the page_table_lock held */ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) @@ -44,7 +46,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, @@ -68,6 +70,7 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -81,6 +84,7 @@ 
__alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); return error; } diff --git a/trunk/arch/arm/kernel/signal.c b/trunk/arch/arm/kernel/signal.c index a917e3dd3666..a94d75fef598 100644 --- a/trunk/arch/arm/kernel/signal.c +++ b/trunk/arch/arm/kernel/signal.c @@ -139,33 +139,93 @@ struct iwmmxt_sigframe { unsigned long storage[0x98/4]; }; +static int page_present(struct mm_struct *mm, void __user *uptr, int wr) +{ + unsigned long addr = (unsigned long)uptr; + pgd_t *pgd = pgd_offset(mm, addr); + if (pgd_present(*pgd)) { + pmd_t *pmd = pmd_offset(pgd, addr); + if (pmd_present(*pmd)) { + pte_t *pte = pte_offset_map(pmd, addr); + return (pte_present(*pte) && (!wr || pte_write(*pte))); + } + } + return 0; +} + +static int copy_locked(void __user *uptr, void *kptr, size_t size, int write, + void (*copyfn)(void *, void __user *)) +{ + unsigned char v, __user *userptr = uptr; + int err = 0; + + do { + struct mm_struct *mm; + + if (write) { + __put_user_error(0, userptr, err); + __put_user_error(0, userptr + size - 1, err); + } else { + __get_user_error(v, userptr, err); + __get_user_error(v, userptr + size - 1, err); + } + + if (err) + break; + + mm = current->mm; + spin_lock(&mm->page_table_lock); + if (page_present(mm, userptr, write) && + page_present(mm, userptr + size - 1, write)) { + copyfn(kptr, uptr); + } else + err = 1; + spin_unlock(&mm->page_table_lock); + } while (err); + + return err; +} + static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame) { - char kbuf[sizeof(*frame) + 8]; - struct iwmmxt_sigframe *kframe; + int err = 0; /* the iWMMXt context must be 64 bit aligned */ - kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); - kframe->magic0 = IWMMXT_MAGIC0; - kframe->magic1 = IWMMXT_MAGIC1; - iwmmxt_task_copy(current_thread_info(), &kframe->storage); - return 
__copy_to_user(frame, kframe, sizeof(*frame)); + WARN_ON((unsigned long)frame & 7); + + __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err); + __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err); + + /* + * iwmmxt_task_copy() doesn't check user permissions. + * Let's do a dummy write on the upper boundary to ensure + * access to user mem is OK all way up. + */ + err |= copy_locked(&frame->storage, current_thread_info(), + sizeof(frame->storage), 1, iwmmxt_task_copy); + return err; } static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame) { - char kbuf[sizeof(*frame) + 8]; - struct iwmmxt_sigframe *kframe; + unsigned long magic0, magic1; + int err = 0; - /* the iWMMXt context must be 64 bit aligned */ - kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); - if (__copy_from_user(kframe, frame, sizeof(*frame))) - return -1; - if (kframe->magic0 != IWMMXT_MAGIC0 || - kframe->magic1 != IWMMXT_MAGIC1) - return -1; - iwmmxt_task_restore(current_thread_info(), &kframe->storage); - return 0; + /* the iWMMXt context is 64 bit aligned */ + WARN_ON((unsigned long)frame & 7); + + /* + * Validate iWMMXt context signature. + * Also, iwmmxt_task_restore() doesn't check user permissions. + * Let's do a dummy write on the upper boundary to ensure + * access to user mem is OK all way up. 
+ */ + __get_user_error(magic0, &frame->magic0, err); + __get_user_error(magic1, &frame->magic1, err); + if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1) + err = copy_locked(&frame->storage, current_thread_info(), + sizeof(frame->storage), 0, iwmmxt_task_restore); + return err; } #endif diff --git a/trunk/arch/arm/kernel/traps.c b/trunk/arch/arm/kernel/traps.c index 66e5a0516f23..baa09601a64e 100644 --- a/trunk/arch/arm/kernel/traps.c +++ b/trunk/arch/arm/kernel/traps.c @@ -483,33 +483,29 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) unsigned long addr = regs->ARM_r2; struct mm_struct *mm = current->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; regs->ARM_cpsr &= ~PSR_C_BIT; - down_read(&mm->mmap_sem); + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) goto bad_access; pmd = pmd_offset(pgd, addr); if (!pmd_present(*pmd)) goto bad_access; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!pte_present(*pte) || !pte_write(*pte)) { - pte_unmap_unlock(pte, ptl); + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte) || !pte_write(*pte)) goto bad_access; - } val = *(unsigned long *)addr; val -= regs->ARM_r0; if (val == 0) { *(unsigned long *)addr = regs->ARM_r1; regs->ARM_cpsr |= PSR_C_BIT; } - pte_unmap_unlock(pte, ptl); - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); return val; bad_access: - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); /* simulate a write access fault */ do_DataAbort(addr, 15 + (1 << 11), regs); return -1; diff --git a/trunk/arch/arm/mm/consistent.c b/trunk/arch/arm/mm/consistent.c index 47b0b767f080..82f4d5e27c54 100644 --- a/trunk/arch/arm/mm/consistent.c +++ b/trunk/arch/arm/mm/consistent.c @@ -397,6 +397,8 @@ static int __init consistent_init(void) pte_t *pte; int ret = 0; + spin_lock(&init_mm.page_table_lock); + do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); @@ -407,7 +409,7 @@ 
static int __init consistent_init(void) } WARN_ON(!pmd_none(*pmd)); - pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); + pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); if (!pte) { printk(KERN_ERR "%s: no pte tables\n", __func__); ret = -ENOMEM; @@ -417,6 +419,8 @@ static int __init consistent_init(void) consistent_pte = pte; } while (0); + spin_unlock(&init_mm.page_table_lock); + return ret; } diff --git a/trunk/arch/arm/mm/fault-armv.c b/trunk/arch/arm/mm/fault-armv.c index 7fc1b35a6746..be4ab3d73c91 100644 --- a/trunk/arch/arm/mm/fault-armv.c +++ b/trunk/arch/arm/mm/fault-armv.c @@ -26,11 +26,6 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE; /* * We take the easy way out of this problem - we make the * PTE uncacheable. However, we leave the write buffer on. - * - * Note that the pte lock held when calling update_mmu_cache must also - * guard the pte (somewhere else in the same mm) that we modify here. - * Therefore those configurations which might call adjust_pte (those - * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock. */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address) { @@ -132,7 +127,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page); * 2. If we have multiple shared mappings of the same space in * an object, we need to deal with the cache aliasing issues. * - * Note that the pte lock will be held. + * Note that the page_table_lock will be held. 
*/ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { diff --git a/trunk/arch/arm/mm/ioremap.c b/trunk/arch/arm/mm/ioremap.c index 0f128c28fee4..6fb1258df1b5 100644 --- a/trunk/arch/arm/mm/ioremap.c +++ b/trunk/arch/arm/mm/ioremap.c @@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, pgprot); @@ -97,6 +97,7 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, phys_addr -= address; dir = pgd_offset(&init_mm, address); BUG_ON(address >= end); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd = pmd_alloc(&init_mm, dir, address); if (!pmd) { @@ -113,6 +114,7 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_cache_vmap(start, end); return err; } diff --git a/trunk/arch/arm/mm/mm-armv.c b/trunk/arch/arm/mm/mm-armv.c index 1221fdde1769..61bc2fa0511e 100644 --- a/trunk/arch/arm/mm/mm-armv.c +++ b/trunk/arch/arm/mm/mm-armv.c @@ -179,6 +179,11 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); if (!vectors_high()) { + /* + * This lock is here just to satisfy pmd_alloc and pte_lock + */ + spin_lock(&mm->page_table_lock); + /* * On ARM, first page must always be allocated since it * contains the machine vectors. 
@@ -196,14 +201,23 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); + + spin_unlock(&mm->page_table_lock); } return new_pgd; no_pte: + spin_unlock(&mm->page_table_lock); pmd_free(new_pmd); + free_pages((unsigned long)new_pgd, 2); + return NULL; + no_pmd: + spin_unlock(&mm->page_table_lock); free_pages((unsigned long)new_pgd, 2); + return NULL; + no_pgd: return NULL; } @@ -229,7 +243,6 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); dec_page_state(nr_page_table_pages); - pte_lock_deinit(pte); pte_free(pte); pmd_free(pmd); free: diff --git a/trunk/arch/arm/oprofile/backtrace.c b/trunk/arch/arm/oprofile/backtrace.c index 7c22c12618cc..df35c452a8bf 100644 --- a/trunk/arch/arm/oprofile/backtrace.c +++ b/trunk/arch/arm/oprofile/backtrace.c @@ -49,22 +49,42 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail) static struct frame_tail* user_backtrace(struct frame_tail *tail) { - struct frame_tail buftail[2]; + struct frame_tail buftail; - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) - return NULL; - if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) + /* hardware pte might not be valid due to dirty/accessed bit emulation + * so we use copy_from_user and benefit from exception fixups */ + if (copy_from_user(&buftail, tail, sizeof(struct frame_tail))) return NULL; - oprofile_add_trace(buftail[0].lr); + oprofile_add_trace(buftail.lr); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (tail >= buftail[0].fp) + if (tail >= buftail.fp) return NULL; - return buftail[0].fp-1; + return buftail.fp-1; +} + +/* Compare two addresses and see if they're on the same page */ +#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \ + == ((((unsigned long) y) + offset) >> PAGE_SHIFT)) + +/* check that the page(s) containing the 
frame tail are present */ +static int pages_present(struct frame_tail *tail) +{ + struct mm_struct * mm = current->mm; + + if (!check_user_page_readable(mm, (unsigned long)tail)) + return 0; + + if (CMP_ADDR_EQUAL(tail, tail, 8)) + return 1; + + if (!check_user_page_readable(mm, ((unsigned long)tail) + 8)) + return 0; + + return 1; } /* @@ -98,6 +118,7 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs) void arm_backtrace(struct pt_regs * const regs, unsigned int depth) { struct frame_tail *tail; + unsigned long last_address = 0; tail = ((struct frame_tail *) regs->ARM_fp) - 1; @@ -111,6 +132,13 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth) return; } - while (depth-- && tail && !((unsigned long) tail & 3)) + while (depth-- && tail && !((unsigned long) tail & 3)) { + if ((!CMP_ADDR_EQUAL(last_address, tail, 0) + || !CMP_ADDR_EQUAL(last_address, tail, 8)) + && !pages_present(tail)) + return; + last_address = (unsigned long) tail; tail = user_backtrace(tail); + } } + diff --git a/trunk/arch/arm26/mm/memc.c b/trunk/arch/arm26/mm/memc.c index 34def6397c3c..8e8a2bb2487d 100644 --- a/trunk/arch/arm26/mm/memc.c +++ b/trunk/arch/arm26/mm/memc.c @@ -78,6 +78,12 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) if (!new_pgd) goto no_pgd; + /* + * This lock is here just to satisfy pmd_alloc and pte_lock + * FIXME: I bet we could avoid taking it pretty much altogether + */ + spin_lock(&mm->page_table_lock); + /* * On ARM, first page must always be allocated since it contains * the machine vectors. 
@@ -86,7 +92,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_kernel(mm, new_pmd, 0); if (!new_pte) goto no_pte; @@ -95,7 +101,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) init_pte = pte_offset(init_pmd, 0); set_pte(new_pte, *init_pte); - pte_unmap(new_pte); /* * the page table entries are zeroed @@ -107,14 +112,23 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); + spin_unlock(&mm->page_table_lock); + /* update MEMC tables */ cpu_memc_update_all(new_pgd); return new_pgd; no_pte: + spin_unlock(&mm->page_table_lock); pmd_free(new_pmd); + free_pgd_slow(new_pgd); + return NULL; + no_pmd: + spin_unlock(&mm->page_table_lock); free_pgd_slow(new_pgd); + return NULL; + no_pgd: return NULL; } diff --git a/trunk/arch/cris/arch-v32/mm/tlb.c b/trunk/arch/cris/arch-v32/mm/tlb.c index b08a28bb58ab..8233406798d3 100644 --- a/trunk/arch/cris/arch-v32/mm/tlb.c +++ b/trunk/arch/cris/arch-v32/mm/tlb.c @@ -175,8 +175,6 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) return 0; } -static DEFINE_SPINLOCK(mmu_context_lock); - /* Called in schedule() just before actually doing the switch_to. */ void switch_mm(struct mm_struct *prev, struct mm_struct *next, @@ -185,10 +183,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, int cpu = smp_processor_id(); /* Make sure there is a MMU context. */ - spin_lock(&mmu_context_lock); + spin_lock(&next->page_table_lock); get_mmu_context(next); cpu_set(cpu, next->cpu_vm_mask); - spin_unlock(&mmu_context_lock); + spin_unlock(&next->page_table_lock); /* * Remember the pgd for the fault handlers. 
Keep a seperate copy of it diff --git a/trunk/arch/cris/mm/ioremap.c b/trunk/arch/cris/mm/ioremap.c index a92ac9877582..ebba11e270fa 100644 --- a/trunk/arch/cris/mm/ioremap.c +++ b/trunk/arch/cris/mm/ioremap.c @@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, prot); @@ -74,6 +74,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pmd_t *pmd; @@ -93,6 +94,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/arch/frv/mm/dma-alloc.c b/trunk/arch/frv/mm/dma-alloc.c index 342823aad758..cfc4f97490c6 100644 --- a/trunk/arch/frv/mm/dma-alloc.c +++ b/trunk/arch/frv/mm/dma-alloc.c @@ -55,18 +55,21 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) pte_t *pte; int err = -ENOMEM; + spin_lock(&init_mm.page_table_lock); + /* Use upper 10 bits of VA to index the first level map */ pge = pgd_offset_k(va); pue = pud_offset(pge, va); pme = pmd_offset(pue, va); /* Use middle 10 bits of VA to index the second-level map */ - pte = pte_alloc_kernel(pme, va); + pte = pte_alloc_kernel(&init_mm, pme, va); if (pte != 0) { err = 0; set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot)); } + spin_unlock(&init_mm.page_table_lock); return err; } diff --git a/trunk/arch/frv/mm/pgalloc.c b/trunk/arch/frv/mm/pgalloc.c index 2c67dfe5a6b3..4eaec0f3525b 100644 --- a/trunk/arch/frv/mm/pgalloc.c +++ b/trunk/arch/frv/mm/pgalloc.c @@ -87,14 +87,14 @@ static inline void 
pgd_list_add(pgd_t *pgd) if (pgd_list) pgd_list->private = (unsigned long) &page->index; pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + page->private = (unsigned long) &pgd_list; } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *) page->index; - pprev = (struct page **)page_private(page); + pprev = (struct page **) page->private; *pprev = next; if (next) next->private = (unsigned long) pprev; diff --git a/trunk/arch/i386/kernel/vm86.c b/trunk/arch/i386/kernel/vm86.c index fc1993564f98..16b485009622 100644 --- a/trunk/arch/i386/kernel/vm86.c +++ b/trunk/arch/i386/kernel/vm86.c @@ -134,16 +134,17 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) return ret; } -static void mark_screen_rdonly(struct mm_struct *mm) +static void mark_screen_rdonly(struct task_struct * tsk) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; + pte_t *pte, *mapped; int i; - pgd = pgd_offset(mm, 0xA0000); + preempt_disable(); + spin_lock(&tsk->mm->page_table_lock); + pgd = pgd_offset(tsk->mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; pud = pud_offset(pgd, 0xA0000); @@ -152,14 +153,16 @@ static void mark_screen_rdonly(struct mm_struct *mm) pmd = pmd_offset(pud, 0xA0000); if (pmd_none_or_clear_bad(pmd)) goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); + pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) set_pte(pte, pte_wrprotect(*pte)); pte++; } - pte_unmap_unlock(pte, ptl); + pte_unmap(mapped); out: + spin_unlock(&tsk->mm->page_table_lock); + preempt_enable(); flush_tlb(); } @@ -303,7 +306,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); + mark_screen_rdonly(tsk); __asm__ __volatile__( "xorl %%eax,%%eax; movl %%eax,%%fs; movl 
%%eax,%%gs\n\t" "movl %0,%%esp\n\t" diff --git a/trunk/arch/i386/mm/discontig.c b/trunk/arch/i386/mm/discontig.c index c4af9638dbfa..244d8ec66be2 100644 --- a/trunk/arch/i386/mm/discontig.c +++ b/trunk/arch/i386/mm/discontig.c @@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); +extern void one_highpage_init(struct page *, int, int); extern struct e820map e820; extern unsigned long init_pg_tables_end; @@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro) if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); + one_highpage_init(page, node_pfn, bad_ppro); } } totalram_pages += totalhigh_pages; diff --git a/trunk/arch/i386/mm/init.c b/trunk/arch/i386/mm/init.c index 542d9298da5e..2ebaf75f732e 100644 --- a/trunk/arch/i386/mm/init.c +++ b/trunk/arch/i386/mm/init.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -267,46 +266,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __devinit free_new_highpage(struct page *page) -{ - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; -} - -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - free_new_highpage(page); + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; } else SetPageReserved(page); } -static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - return 0; -} - -/* - * Not currently handling the NUMA case. 
- * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM - */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - - #ifdef CONFIG_NUMA extern void set_highmem_pages_init(int); #else @@ -314,7 +284,7 @@ static void __init set_highmem_pages_init(int bad_ppro) { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -645,28 +615,6 @@ void __init mem_init(void) #endif } -/* - * this is for the non-NUMA, single node SMP system case. - * Specifically, in the case of x86, we will always add - * memory to the highmem for now. - */ -#ifndef CONFIG_NEED_MULTIPLE_NODES -int add_memory(u64 start, u64 size) -{ - struct pglist_data *pgdata = &contig_page_data; - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - return __add_pages(zone, start_pfn, nr_pages); -} - -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -#endif - kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; diff --git a/trunk/arch/i386/mm/ioremap.c b/trunk/arch/i386/mm/ioremap.c index 5d09de8d1c6b..f379b8d67558 100644 --- a/trunk/arch/i386/mm/ioremap.c +++ b/trunk/arch/i386/mm/ioremap.c @@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long pfn; pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel(&init_mm, pmd, addr); if (!pte) return -ENOMEM; do { @@ -87,12 +87,14 @@ static int ioremap_page_range(unsigned long addr, flush_cache_all(); phys_addr -= addr; pgd = pgd_offset_k(addr); + spin_lock(&init_mm.page_table_lock); do { next = pgd_addr_end(addr, end); err = ioremap_pud_range(pgd, addr, next, 
phys_addr+addr, flags); if (err) break; } while (pgd++, addr = next, addr != end); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return err; } diff --git a/trunk/arch/i386/mm/pgtable.c b/trunk/arch/i386/mm/pgtable.c index 9db3242103be..dcdce2c6c532 100644 --- a/trunk/arch/i386/mm/pgtable.c +++ b/trunk/arch/i386/mm/pgtable.c @@ -31,13 +31,11 @@ void show_mem(void) pg_data_t *pgdat; unsigned long i; struct page_state ps; - unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -50,7 +48,6 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } - pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); @@ -191,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); page->index = (unsigned long)pgd_list; if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); + pgd_list->private = (unsigned long)&page->index; pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + page->private = (unsigned long)&pgd_list; } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *)page->index; - pprev = (struct page **)page_private(page); + pprev = (struct page **)page->private; *pprev = next; if (next) - set_page_private(next, (unsigned long)pprev); + next->private = (unsigned long)pprev; } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) diff --git a/trunk/arch/i386/oprofile/backtrace.c b/trunk/arch/i386/oprofile/backtrace.c index 21654be3f73f..65dfd2edb671 100644 --- a/trunk/arch/i386/oprofile/backtrace.c +++ b/trunk/arch/i386/oprofile/backtrace.c @@ -12,7 +12,6 @@ 
#include #include #include -#include struct frame_head { struct frame_head * ebp; @@ -22,22 +21,26 @@ struct frame_head { static struct frame_head * dump_backtrace(struct frame_head * head) { - struct frame_head bufhead[2]; - - /* Also check accessibility of one struct frame_head beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) - return NULL; - - oprofile_add_trace(bufhead[0].ret); + oprofile_add_trace(head->ret); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].ebp) + if (head >= head->ebp) return NULL; - return bufhead[0].ebp; + return head->ebp; +} + +/* check that the page(s) containing the frame head are present */ +static int pages_present(struct frame_head * head) +{ + struct mm_struct * mm = current->mm; + + /* FIXME: only necessary once per page */ + if (!check_user_page_readable(mm, (unsigned long)head)) + return 0; + + return check_user_page_readable(mm, (unsigned long)(head + 1)); } /* @@ -94,6 +97,15 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) return; } - while (depth-- && head) +#ifdef CONFIG_SMP + if (!spin_trylock(&current->mm->page_table_lock)) + return; +#endif + + while (depth-- && head && pages_present(head)) head = dump_backtrace(head); + +#ifdef CONFIG_SMP + spin_unlock(&current->mm->page_table_lock); +#endif } diff --git a/trunk/arch/ia64/kernel/perfmon.c b/trunk/arch/ia64/kernel/perfmon.c index f7dfc107cb7b..d71731ee5b61 100644 --- a/trunk/arch/ia64/kernel/perfmon.c +++ b/trunk/arch/ia64/kernel/perfmon.c @@ -2352,8 +2352,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon insert_vm_struct(mm, vma); mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); + vm_stat_account(vma); up_write(&task->mm->mmap_sem); /* diff --git a/trunk/arch/ia64/mm/discontig.c
b/trunk/arch/ia64/mm/discontig.c index a88cdb7232f8..a3788fb84809 100644 --- a/trunk/arch/ia64/mm/discontig.c +++ b/trunk/arch/ia64/mm/discontig.c @@ -555,13 +555,9 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present; - unsigned long flags; + unsigned long present = pgdat->node_present_pages; int shared = 0, cached = 0, reserved = 0; - printk("Node ID: %d\n", pgdat->node_id); - pgdat_resize_lock(pgdat, &flags); - present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page; if (pfn_valid(pgdat->node_start_pfn + i)) @@ -575,7 +571,6 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page)-1; } - pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; diff --git a/trunk/arch/ia64/mm/fault.c b/trunk/arch/ia64/mm/fault.c index af7eb087dca7..3c32af910d60 100644 --- a/trunk/arch/ia64/mm/fault.c +++ b/trunk/arch/ia64/mm/fault.c @@ -19,6 +19,32 @@ extern void die (char *, struct pt_regs *, long); +/* + * This routine is analogous to expand_stack() but instead grows the + * register backing store (which grows towards higher addresses). + * Since the register backing store is access sequentially, we + * disallow growing the RBS by more than a page at a time. Note that + * the VM_GROWSUP flag can be set on any VM area but that's fine + * because the total process size is still limited by RLIMIT_STACK and + * RLIMIT_AS. 
+ */ +static inline long +expand_backing_store (struct vm_area_struct *vma, unsigned long address) +{ + unsigned long grow; + + grow = PAGE_SIZE >> PAGE_SHIFT; + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur + || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) + return -ENOMEM; + vma->vm_end += PAGE_SIZE; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); + return 0; +} + /* * Return TRUE if ADDRESS points at a page in the kernel's mapped segment * (inside region 5, on ia64) and that page is present. @@ -159,13 +185,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) || REGION_OFFSET(address) >= RGN_MAP_LIMIT) goto bad_area; - /* - * Since the register backing store is accessed sequentially, - * we disallow growing it by more than a page at a time. - */ - if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) - goto bad_area; - if (expand_upwards(vma, address)) + if (expand_backing_store(vma, address)) goto bad_area; } goto good_area; diff --git a/trunk/arch/ia64/mm/init.c b/trunk/arch/ia64/mm/init.c index e3215ba64ffd..98246acd4991 100644 --- a/trunk/arch/ia64/mm/init.c +++ b/trunk/arch/ia64/mm/init.c @@ -158,7 +158,7 @@ ia64_init_addr_space (void) vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP; down_write(&current->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); @@ -275,21 +275,26 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()!
*/ + spin_lock(&init_mm.page_table_lock); { pud = pud_alloc(&init_mm, pgd, address); if (!pud) goto out; + pmd = pmd_alloc(&init_mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_kernel(pmd, address); + pte = pte_alloc_map(&init_mm, pmd, address); if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pte_unmap(pte); goto out; + } set_pte(pte, mk_pte(page, pgprot)); + pte_unmap(pte); } - out: + out: spin_unlock(&init_mm.page_table_lock); /* no need for flush_tlb */ return page; } diff --git a/trunk/arch/ia64/mm/tlb.c b/trunk/arch/ia64/mm/tlb.c index c79a9b96d02b..c93e0f2b5fea 100644 --- a/trunk/arch/ia64/mm/tlb.c +++ b/trunk/arch/ia64/mm/tlb.c @@ -158,12 +158,10 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long # ifdef CONFIG_SMP platform_global_tlb_purge(mm, start, end, nbits); # else - preempt_disable(); do { ia64_ptcl(start, (nbits<<2)); start += (1UL << nbits); } while (start < end); - preempt_enable(); # endif ia64_srlz_i(); /* srlz.i implies srlz.d */ diff --git a/trunk/arch/m32r/mm/init.c b/trunk/arch/m32r/mm/init.c index 6facf15b04f3..d9a40b1fe8ba 100644 --- a/trunk/arch/m32r/mm/init.c +++ b/trunk/arch/m32r/mm/init.c @@ -48,8 +48,6 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long flags; - pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -62,7 +60,6 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } - pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -153,14 +150,10 @@ int __init reservedpages_count(void) int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) { - unsigned long flags; - pgdat_resize_lock(NODE_DATA(nid), &flags); + for_each_online_node(nid) for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if 
(PageReserved(nid_page_nr(nid, i))) reservedpages++; - pgdat_resize_unlock(NODE_DATA(nid), &flags); - } return reservedpages; } diff --git a/trunk/arch/m32r/mm/ioremap.c b/trunk/arch/m32r/mm/ioremap.c index a151849a605e..70c59055c19c 100644 --- a/trunk/arch/m32r/mm/ioremap.c +++ b/trunk/arch/m32r/mm/ioremap.c @@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -90,6 +90,7 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -103,6 +104,7 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/arch/m68k/Kconfig b/trunk/arch/m68k/Kconfig index 1dd5d18b2201..ba960bbc8e6d 100644 --- a/trunk/arch/m68k/Kconfig +++ b/trunk/arch/m68k/Kconfig @@ -388,11 +388,33 @@ config AMIGA_PCMCIA Include support in the kernel for pcmcia on Amiga 1200 and Amiga 600. If you intend to use pcmcia cards say Y; otherwise say N. +config STRAM_SWAP + bool "Support for ST-RAM as swap space" + depends on ATARI && BROKEN + ---help--- + Some Atari 68k machines (including the 520STF and 1020STE) divide + their addressable memory into ST and TT sections. The TT section + (up to 512MB) is the main memory; the ST section (up to 4MB) is + accessible to the built-in graphics board, runs slower, and is + present mainly for backward compatibility with older machines. + + This enables support for using (parts of) ST-RAM as swap space, + instead of as normal system memory. 
This can first enhance system + performance if you have lots of alternate RAM (compared to the size + of ST-RAM), because executable code always will reside in faster + memory. ST-RAM will remain as ultra-fast swap space. On the other + hand, it allows much improved dynamic allocations of ST-RAM buffers + for device driver modules (e.g. floppy, ACSI, SLM printer, DMA + sound). The probability that such allocations at module load time + fail is drastically reduced. + config STRAM_PROC bool "ST-RAM statistics in /proc" depends on ATARI help - Say Y here to report ST-RAM usage statistics in /proc/stram. + Say Y here to report ST-RAM usage statistics in /proc/stram. See + the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its + uses. config HEARTBEAT bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40 diff --git a/trunk/arch/m68k/atari/stram.c b/trunk/arch/m68k/atari/stram.c index 22e0481a5f7b..5a3c106b40c8 100644 --- a/trunk/arch/m68k/atari/stram.c +++ b/trunk/arch/m68k/atari/stram.c @@ -15,9 +15,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -31,6 +33,8 @@ #include #include +#include + #undef DEBUG #ifdef DEBUG @@ -45,7 +49,8 @@ #include #endif -/* +/* Pre-swapping comments: + * * ++roman: * * New version of ST-Ram buffer allocation. Instead of using the @@ -70,6 +75,76 @@ * */ +/* + * New Nov 1997: Use ST-RAM as swap space! + * + * In the past, there were often problems with modules that require ST-RAM + * buffers. Such drivers have to use __get_dma_pages(), which unfortunately + * often isn't very successful in allocating more than 1 page :-( [1] The net + * result was that most of the time you couldn't insmod such modules (ataflop, + * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm, + * which needs a 1 MB buffer... :-). + * + * To overcome this limitation, ST-RAM can now be turned into a very + * high-speed swap space. 
If a request for an ST-RAM buffer comes, the kernel + * now tries to unswap some pages on that swap device to make some free (and + * contiguous) space. This works much better in comparison to + * __get_dma_pages(), since used swap pages can be selectively freed by either + * moving them to somewhere else in swap space, or by reading them back into + * system memory. Ok, there operation of unswapping isn't really cheap (for + * each page, one has to go through the page tables of all processes), but it + * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a + * module that needs ST-RAM). But it at least makes it possible to load such + * modules! + * + * It could also be that overall system performance increases a bit due to + * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or + * executing code in. It's then just a (very fast, compared to disk) back + * storage for not-so-often needed data. (But this effect must be compared + * with the loss of total memory...) Don't know if the effect is already + * visible on a TT, where the speed difference between ST- and TT-RAM isn't + * that dramatic, but it should on machines where TT-RAM is really much faster + * (e.g. Afterburner). + * + * [1]: __get_free_pages() does a fine job if you only want one page, but if + * you want more (contiguous) pages, it can give you such a block only if + * there's already a free one. The algorithm can't try to free buffers or swap + * out something in order to make more free space, since all that page-freeing + * mechanisms work "target-less", i.e. they just free something, but not in a + * specific place. I.e., __get_free_pages() can't do anything to free + * *adjacent* pages :-( This situation becomes even worse for DMA memory, + * since the freeing algorithms are also blind to DMA capability of pages. + */ + +/* 1998-10-20: ++andreas + unswap_by_move disabled because it does not handle swapped shm pages. 
+*/ + +/* 2000-05-01: ++andreas + Integrated with bootmem. Remove all traces of unswap_by_move. +*/ + +#ifdef CONFIG_STRAM_SWAP +#define ALIGN_IF_SWAP(x) PAGE_ALIGN(x) +#else +#define ALIGN_IF_SWAP(x) (x) +#endif + +/* get index of swap page at address 'addr' */ +#define SWAP_NR(addr) (((addr) - swap_start) >> PAGE_SHIFT) + +/* get address of swap page #'nr' */ +#define SWAP_ADDR(nr) (swap_start + ((nr) << PAGE_SHIFT)) + +/* get number of pages for 'n' bytes (already page-aligned) */ +#define N_PAGES(n) ((n) >> PAGE_SHIFT) + +/* The following two numbers define the maximum fraction of ST-RAM in total + * memory, below that the kernel would automatically use ST-RAM as swap + * space. This decision can be overridden with stram_swap= */ +#define MAX_STRAM_FRACTION_NOM 1 +#define MAX_STRAM_FRACTION_DENOM 3 + /* Start and end (virtual) of ST-RAM */ static void *stram_start, *stram_end; @@ -89,9 +164,10 @@ typedef struct stram_block { } BLOCK; /* values for flags field */ -#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ +#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ #define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */ -#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ +#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ +#define BLOCK_INSWAP 0x10 /* block allocated in swap space */ /* list of allocated blocks */ static BLOCK *alloc_list; @@ -103,8 +179,60 @@ static BLOCK *alloc_list; #define N_STATIC_BLOCKS 20 static BLOCK static_blocks[N_STATIC_BLOCKS]; +#ifdef CONFIG_STRAM_SWAP +/* max. 
number of bytes to use for swapping + * 0 = no ST-RAM swapping + * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of + * total memory + */ +static int max_swap_size = -1; + +/* start and end of swapping area */ +static void *swap_start, *swap_end; + +/* The ST-RAM's swap info structure */ +static struct swap_info_struct *stram_swap_info; + +/* The ST-RAM's swap type */ +static int stram_swap_type; + +/* Semaphore for get_stram_region. */ +static DECLARE_MUTEX(stram_swap_sem); + +/* major and minor device number of the ST-RAM device; for the major, we use + * the same as Amiga z2ram, which is really similar and impossible on Atari, + * and for the minor a relatively odd number to avoid the user creating and + * using that device. */ +#define STRAM_MAJOR Z2RAM_MAJOR +#define STRAM_MINOR 13 + +/* Some impossible pointer value */ +#define MAGIC_FILE_P (struct file *)0xffffdead + +#ifdef DO_PROC +static unsigned stat_swap_read; +static unsigned stat_swap_write; +static unsigned stat_swap_force; +#endif /* DO_PROC */ + +#endif /* CONFIG_STRAM_SWAP */ + /***************************** Prototypes *****************************/ +#ifdef CONFIG_STRAM_SWAP +static int swap_init(void *start_mem, void *swap_data); +static void *get_stram_region( unsigned long n_pages ); +static void free_stram_region( unsigned long offset, unsigned long n_pages + ); +static int in_some_region(void *addr); +static unsigned long find_free_region( unsigned long n_pages, unsigned long + *total_free, unsigned long + *region_free ); +static void do_stram_request(request_queue_t *); +static int stram_open( struct inode *inode, struct file *filp ); +static int stram_release( struct inode *inode, struct file *filp ); +static void reserve_region(void *start, void *end); +#endif static BLOCK *add_region( void *addr, unsigned long size ); static BLOCK *find_region( void *addr ); static int remove_region( BLOCK *block ); @@ -151,11 +279,84 @@ void __init atari_stram_init(void) */ void 
__init atari_stram_reserve_pages(void *start_mem) { +#ifdef CONFIG_STRAM_SWAP + /* if max_swap_size is negative (i.e. no stram_swap= option given), + * determine at run time whether to use ST-RAM swapping */ + if (max_swap_size < 0) + /* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION + * of total memory. In that case, the max. size is set to 16 MB, + * because ST-RAM can never be bigger than that. + * Also, never use swapping on a Hades, there's no separate ST-RAM in + * that machine. */ + max_swap_size = + (!MACH_IS_HADES && + (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <= + ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0; + DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size ); +#endif + /* always reserve first page of ST-RAM, the first 2 kB are * supervisor-only! */ if (!kernel_in_stram) reserve_bootmem (0, PAGE_SIZE); +#ifdef CONFIG_STRAM_SWAP + { + void *swap_data; + + start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem); + /* determine first page to use as swap: if the kernel is + in TT-RAM, this is the first page of (usable) ST-RAM; + otherwise just use the end of kernel data (= start_mem) */ + swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem; + /* decrement by one page, rest of kernel assumes that first swap page + * is always reserved and maybe doesn't handle swp_entry == 0 + * correctly */ + swap_start -= PAGE_SIZE; + swap_end = stram_end; + if (swap_end-swap_start > max_swap_size) + swap_end = swap_start + max_swap_size; + DPRINTK( "atari_stram_reserve_pages: swapping enabled; " + "swap=%p-%p\n", swap_start, swap_end); + + /* reserve some amount of memory for maintainance of + * swapping itself: one page for each 2048 (PAGE_SIZE/2) + * swap pages. 
(2 bytes for each page) */ + swap_data = start_mem; + start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1) + >> (PAGE_SHIFT-1)) << PAGE_SHIFT; + /* correct swap_start if necessary */ + if (swap_start + PAGE_SIZE == swap_data) + swap_start = start_mem - PAGE_SIZE; + + if (!swap_init( start_mem, swap_data )) { + printk( KERN_ERR "ST-RAM swap space initialization failed\n" ); + max_swap_size = 0; + return; + } + /* reserve region for swapping meta-data */ + reserve_region(swap_data, start_mem); + /* reserve swapping area itself */ + reserve_region(swap_start + PAGE_SIZE, swap_end); + + /* + * If the whole ST-RAM is used for swapping, there are no allocatable + * dma pages left. But unfortunately, some shared parts of the kernel + * (particularly the SCSI mid-level) call __get_dma_pages() + * unconditionally :-( These calls then fail, and scsi.c even doesn't + * check for NULL return values and just crashes. The quick fix for + * this (instead of doing much clean up work in the SCSI code) is to + * pretend all pages are DMA-able by setting mach_max_dma_address to + * ULONG_MAX. This doesn't change any functionality so far, since + * get_dma_pages() shouldn't be used on Atari anyway anymore (better + * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA + * memory. But unfortunately there's now no kind of warning (even not + * a NULL return value) if you use get_dma_pages() nevertheless :-( + * You just will get non-DMA-able memory... + */ + mach_max_dma_address = 0xffffffff; + } +#endif } void atari_stram_mem_init_hook (void) @@ -166,6 +367,7 @@ void atari_stram_mem_init_hook (void) /* * This is main public interface: somehow allocate a ST-RAM block + * There are three strategies: * * - If we're before mem_init(), we have to make a static allocation. The * region is taken in the kernel data area (if the kernel is in ST-RAM) or @@ -173,9 +375,14 @@ void atari_stram_mem_init_hook (void) * rsvd_stram_* region. 
The ST-RAM is somewhere in the middle of kernel * address space in the latter case. * - * - If mem_init() already has been called, try with __get_dma_pages(). - * This has the disadvantage that it's very hard to get more than 1 page, - * and it is likely to fail :-( + * - If mem_init() already has been called and ST-RAM swapping is enabled, + * try to get the memory from the (pseudo) swap-space, either free already + * or by moving some other pages out of the swap. + * + * - If mem_init() already has been called, and ST-RAM swapping is not + * enabled, the only possibility is to try with __get_dma_pages(). This has + * the disadvantage that it's very hard to get more than 1 page, and it is + * likely to fail :-( * */ void *atari_stram_alloc(long size, const char *owner) @@ -186,13 +393,27 @@ void *atari_stram_alloc(long size, const char *owner) DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner); + size = ALIGN_IF_SWAP(size); + DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size ); +#ifdef CONFIG_STRAM_SWAP + if (max_swap_size) { + /* If swapping is active: make some free space in the swap + "device". 
*/ + DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, " + "calling get_region\n" ); + addr = get_stram_region( N_PAGES(size) ); + flags = BLOCK_INSWAP; + } + else +#endif if (!mem_init_done) return alloc_bootmem_low(size); else { - /* After mem_init(): can only resort to __get_dma_pages() */ + /* After mem_init() and no swapping: can only resort to + * __get_dma_pages() */ addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size)); flags = BLOCK_GFP; - DPRINTK( "atari_stram_alloc: after mem_init, " + DPRINTK( "atari_stram_alloc: after mem_init, swapping off, " "get_pages=%p\n", addr ); } @@ -201,7 +422,12 @@ void *atari_stram_alloc(long size, const char *owner) /* out of memory for BLOCK structure :-( */ DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- " "freeing again\n" ); - free_pages((unsigned long)addr, get_order(size)); +#ifdef CONFIG_STRAM_SWAP + if (flags == BLOCK_INSWAP) + free_stram_region( SWAP_NR(addr), N_PAGES(size) ); + else +#endif + free_pages((unsigned long)addr, get_order(size)); return( NULL ); } block->owner = owner; @@ -225,12 +451,25 @@ void atari_stram_free( void *addr ) DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, " "flags=%02x\n", block, block->size, block->owner, block->flags ); - if (!(block->flags & BLOCK_GFP)) +#ifdef CONFIG_STRAM_SWAP + if (!max_swap_size) { +#endif + if (block->flags & BLOCK_GFP) { + DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", + get_order(block->size)); + free_pages((unsigned long)addr, get_order(block->size)); + } + else + goto fail; +#ifdef CONFIG_STRAM_SWAP + } + else if (block->flags & BLOCK_INSWAP) { + DPRINTK( "atari_stram_free: is swap-alloced\n" ); + free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) ); + } + else goto fail; - - DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", - get_order(block->size)); - free_pages((unsigned long)addr, get_order(block->size)); +#endif remove_region( block ); return; @@ -239,6 +478,612 @@ void 
atari_stram_free( void *addr ) "(called from %p)\n", addr, __builtin_return_address(0) ); } + +#ifdef CONFIG_STRAM_SWAP + + +/* ------------------------------------------------------------------------ */ +/* Main Swapping Functions */ +/* ------------------------------------------------------------------------ */ + + +/* + * Initialize ST-RAM swap device + * (lots copied and modified from sys_swapon() in mm/swapfile.c) + */ +static int __init swap_init(void *start_mem, void *swap_data) +{ + static struct dentry fake_dentry; + static struct vfsmount fake_vfsmnt; + struct swap_info_struct *p; + struct inode swap_inode; + unsigned int type; + void *addr; + int i, j, k, prev; + + DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n", + start_mem, swap_data); + + /* need at least one page for swapping to (and this also isn't very + * much... :-) */ + if (swap_end - swap_start < 2*PAGE_SIZE) { + printk( KERN_WARNING "stram_swap_init: swap space too small\n" ); + return( 0 ); + } + + /* find free slot in swap_info */ + for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ ) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) { + printk( KERN_WARNING "stram_swap_init: max. number of " + "swap devices exhausted\n" ); + return( 0 ); + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + + stram_swap_info = p; + stram_swap_type = type; + + /* fake some dir cache entries to give us some name in /dev/swaps */ + fake_dentry.d_parent = &fake_dentry; + fake_dentry.d_name.name = "stram (internal)"; + fake_dentry.d_name.len = 16; + fake_vfsmnt.mnt_parent = &fake_vfsmnt; + + p->flags = SWP_USED; + p->swap_file = &fake_dentry; + p->swap_vfsmnt = &fake_vfsmnt; + p->swap_map = swap_data; + p->cluster_nr = 0; + p->next = -1; + p->prio = 0x7ff0; /* a rather high priority, but not the higest + * to give the user a chance to override */ + + /* call stram_open() directly, avoids at least the overhead in + * constructing a dummy file structure... 
*/ + swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR ); + stram_open( &swap_inode, MAGIC_FILE_P ); + p->max = SWAP_NR(swap_end); + + /* initialize swap_map: set regions that are already allocated or belong + * to kernel data space to SWAP_MAP_BAD, otherwise to free */ + j = 0; /* # of free pages */ + k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */ + p->lowest_bit = 0; + p->highest_bit = 0; + for( i = 1, addr = SWAP_ADDR(1); i < p->max; + i++, addr += PAGE_SIZE ) { + if (in_some_region( addr )) { + p->swap_map[i] = SWAP_MAP_BAD; + ++k; + } + else if (kernel_in_stram && addr < start_mem ) { + p->swap_map[i] = SWAP_MAP_BAD; + } + else { + p->swap_map[i] = 0; + ++j; + if (!p->lowest_bit) p->lowest_bit = i; + p->highest_bit = i; + } + } + /* first page always reserved (and doesn't really belong to swap space) */ + p->swap_map[0] = SWAP_MAP_BAD; + + /* now swapping to this device ok */ + p->pages = j + k; + swap_list_lock(); + nr_swap_pages += j; + p->flags = SWP_WRITEOK; + + /* insert swap space into swap_list */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + swap_list_unlock(); + + printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n", + p->pages << 2, p->pages ); + return( 1 ); +} + + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. 
+ */ +static inline void unswap_pte(struct vm_area_struct * vma, unsigned long + address, pte_t *dir, swp_entry_t entry, + struct page *page) +{ + pte_t pte = *dir; + + if (pte_none(pte)) + return; + if (pte_present(pte)) { + /* If this entry is swap-cached, then page must already + hold the right address for any copies in physical + memory */ + if (pte_page(pte) != page) + return; + /* We will be removing the swap cache in a moment, so... */ + set_pte(dir, pte_mkdirty(pte)); + return; + } + if (pte_val(pte) != entry.val) + return; + + DPRINTK("unswap_pte: replacing entry %08lx by new page %p", + entry.val, page); + set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); + get_page(page); + inc_mm_counter(vma->vm_mm, rss); +} + +static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, + unsigned long offset, swp_entry_t entry, + struct page *page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset_kernel(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address < end); +} + +static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page *page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + unswap_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } 
while (address < end); +} + +static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page *page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + do { + unswap_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start < end); +} + +static void unswap_process(struct mm_struct * mm, swp_entry_t entry, + struct page *page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + if (!mm) + return; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unswap_vma(vma, pgd, entry, page); + } +} + + +static int unswap_by_read(unsigned short *map, unsigned long max, + unsigned long start, unsigned long n_pages) +{ + struct task_struct *p; + struct page *page; + swp_entry_t entry; + unsigned long i; + + DPRINTK( "unswapping %lu..%lu by reading in\n", + start, start+n_pages-1 ); + + for( i = start; i < start+n_pages; ++i ) { + if (map[i] == SWAP_MAP_BAD) { + printk( KERN_ERR "get_stram_region: page %lu already " + "reserved??\n", i ); + continue; + } + + if (map[i]) { + entry = swp_entry(stram_swap_type, i); + DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n", + i, map[i], nr_swap_pages); + + swap_device_lock(stram_swap_info); + map[i]++; + swap_device_unlock(stram_swap_info); + /* Get a page for the entry, using the existing + swap cache page if there is one. Otherwise, + get a clean page and read the swap into it. */ + page = read_swap_cache_async(entry, NULL, 0); + if (!page) { + swap_free(entry); + return -ENOMEM; + } + read_lock(&tasklist_lock); + for_each_process(p) + unswap_process(p->mm, entry, page); + read_unlock(&tasklist_lock); + shmem_unuse(entry, page); + /* Now get rid of the extra reference to the + temporary page we've been using. 
*/ + if (PageSwapCache(page)) + delete_from_swap_cache(page); + __free_page(page); + #ifdef DO_PROC + stat_swap_force++; + #endif + } + + DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n", + i, map[i], nr_swap_pages ); + swap_list_lock(); + swap_device_lock(stram_swap_info); + map[i] = SWAP_MAP_BAD; + if (stram_swap_info->lowest_bit == i) + stram_swap_info->lowest_bit++; + if (stram_swap_info->highest_bit == i) + stram_swap_info->highest_bit--; + --nr_swap_pages; + swap_device_unlock(stram_swap_info); + swap_list_unlock(); + } + + return 0; +} + +/* + * reserve a region in ST-RAM swap space for an allocation + */ +static void *get_stram_region( unsigned long n_pages ) +{ + unsigned short *map = stram_swap_info->swap_map; + unsigned long max = stram_swap_info->max; + unsigned long start, total_free, region_free; + int err; + void *ret = NULL; + + DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages ); + + down(&stram_swap_sem); + + /* disallow writing to the swap device now */ + stram_swap_info->flags = SWP_USED; + + /* find a region of n_pages pages in the swap space including as much free + * pages as possible (and excluding any already-reserved pages). 
*/ + if (!(start = find_free_region( n_pages, &total_free, &region_free ))) + goto end; + DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n", + start, region_free ); + + err = unswap_by_read(map, max, start, n_pages); + if (err) + goto end; + + ret = SWAP_ADDR(start); + end: + /* allow using swap device again */ + stram_swap_info->flags = SWP_WRITEOK; + up(&stram_swap_sem); + DPRINTK( "get_stram_region: returning %p\n", ret ); + return( ret ); +} + + +/* + * free a reserved region in ST-RAM swap space + */ +static void free_stram_region( unsigned long offset, unsigned long n_pages ) +{ + unsigned short *map = stram_swap_info->swap_map; + + DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages ); + + if (offset < 1 || offset + n_pages > stram_swap_info->max) { + printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" ); + return; + } + + swap_list_lock(); + swap_device_lock(stram_swap_info); + /* un-reserve the freed pages */ + for( ; n_pages > 0; ++offset, --n_pages ) { + if (map[offset] != SWAP_MAP_BAD) + printk( KERN_ERR "free_stram_region: Swap page %lu was not " + "reserved\n", offset ); + map[offset] = 0; + } + + /* update swapping meta-data */ + if (offset < stram_swap_info->lowest_bit) + stram_swap_info->lowest_bit = offset; + if (offset+n_pages-1 > stram_swap_info->highest_bit) + stram_swap_info->highest_bit = offset+n_pages-1; + if (stram_swap_info->prio > swap_info[swap_list.next].prio) + swap_list.next = swap_list.head; + nr_swap_pages += n_pages; + swap_device_unlock(stram_swap_info); + swap_list_unlock(); +} + + +/* ------------------------------------------------------------------------ */ +/* Utility Functions for Swapping */ +/* ------------------------------------------------------------------------ */ + + +/* is addr in some of the allocated regions?
*/ +static int in_some_region(void *addr) +{ + BLOCK *p; + + for( p = alloc_list; p; p = p->next ) { + if (p->start <= addr && addr < p->start + p->size) + return( 1 ); + } + return( 0 ); +} + + +static unsigned long find_free_region(unsigned long n_pages, + unsigned long *total_free, + unsigned long *region_free) +{ + unsigned short *map = stram_swap_info->swap_map; + unsigned long max = stram_swap_info->max; + unsigned long head, tail, max_start; + long nfree, max_free; + + /* first scan the swap space for a suitable place for the allocation */ + head = 1; + max_start = 0; + max_free = -1; + *total_free = 0; + + start_over: + /* increment tail until final window size reached, and count free pages */ + nfree = 0; + for( tail = head; tail-head < n_pages && tail < max; ++tail ) { + if (map[tail] == SWAP_MAP_BAD) { + head = tail+1; + goto start_over; + } + if (!map[tail]) { + ++nfree; + ++*total_free; + } + } + if (tail-head < n_pages) + goto out; + if (nfree > max_free) { + max_start = head; + max_free = nfree; + if (max_free >= n_pages) + /* don't need more free pages... :-) */ + goto out; + } + + /* now shift the window and look for the area where as much pages as + * possible are free */ + while( tail < max ) { + nfree -= (map[head++] == 0); + if (map[tail] == SWAP_MAP_BAD) { + head = tail+1; + goto start_over; + } + if (!map[tail]) { + ++nfree; + ++*total_free; + } + ++tail; + if (nfree > max_free) { + max_start = head; + max_free = nfree; + if (max_free >= n_pages) + /* don't need more free pages... :-) */ + goto out; + } + } + + out: + if (max_free < 0) { + printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented " + "-- can't allocate %lu pages\n", n_pages ); + return( 0 ); + } + + *region_free = max_free; + return( max_start ); +} + + +/* setup parameters from command line */ +void __init stram_swap_setup(char *str, int *ints) +{ + if (ints[0] >= 1) + max_swap_size = ((ints[1] < 0 ? 
0 : ints[1]) * 1024) & PAGE_MASK; +} + + +/* ------------------------------------------------------------------------ */ +/* ST-RAM device */ +/* ------------------------------------------------------------------------ */ + +static int refcnt; + +static void do_stram_request(request_queue_t *q) +{ + struct request *req; + + while ((req = elv_next_request(q)) != NULL) { + void *start = swap_start + (req->sector << 9); + unsigned long len = req->current_nr_sectors << 9; + if ((start + len) > swap_end) { + printk( KERN_ERR "stram: bad access beyond end of device: " + "block=%ld, count=%d\n", + req->sector, + req->current_nr_sectors ); + end_request(req, 0); + continue; + } + + if (req->cmd == READ) { + memcpy(req->buffer, start, len); +#ifdef DO_PROC + stat_swap_read += N_PAGES(len); +#endif + } + else { + memcpy(start, req->buffer, len); +#ifdef DO_PROC + stat_swap_write += N_PAGES(len); +#endif + } + end_request(req, 1); + } +} + + +static int stram_open( struct inode *inode, struct file *filp ) +{ + if (filp != MAGIC_FILE_P) { + printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" ); + return( -EPERM ); + } + if (refcnt) + return( -EBUSY ); + ++refcnt; + return( 0 ); +} + +static int stram_release( struct inode *inode, struct file *filp ) +{ + if (filp != MAGIC_FILE_P) { + printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" ); + return( -EPERM ); + } + if (refcnt > 0) + --refcnt; + return( 0 ); +} + + +static struct block_device_operations stram_fops = { + .open = stram_open, + .release = stram_release, +}; + +static struct gendisk *stram_disk; +static struct request_queue *stram_queue; +static DEFINE_SPINLOCK(stram_lock); + +int __init stram_device_init(void) +{ + if (!MACH_IS_ATARI) + /* no point in initializing this, I hope */ + return -ENXIO; + + if (!max_swap_size) + /* swapping not enabled */ + return -ENXIO; + stram_disk = alloc_disk(1); + if (!stram_disk) + return -ENOMEM; + + if (register_blkdev(STRAM_MAJOR, "stram")) { + 
put_disk(stram_disk); + return -ENXIO; + } + + stram_queue = blk_init_queue(do_stram_request, &stram_lock); + if (!stram_queue) { + unregister_blkdev(STRAM_MAJOR, "stram"); + put_disk(stram_disk); + return -ENOMEM; + } + + stram_disk->major = STRAM_MAJOR; + stram_disk->first_minor = STRAM_MINOR; + stram_disk->fops = &stram_fops; + stram_disk->queue = stram_queue; + sprintf(stram_disk->disk_name, "stram"); + set_capacity(stram_disk, (swap_end - swap_start)/512); + add_disk(stram_disk); + return 0; +} + + + +/* ------------------------------------------------------------------------ */ +/* Misc Utility Functions */ +/* ------------------------------------------------------------------------ */ + +/* reserve a range of pages */ +static void reserve_region(void *start, void *end) +{ + reserve_bootmem (virt_to_phys(start), end - start); +} + +#endif /* CONFIG_STRAM_SWAP */ + /* ------------------------------------------------------------------------ */ /* Region Management */ @@ -328,9 +1173,50 @@ int get_stram_list( char *buf ) { int len = 0; BLOCK *p; +#ifdef CONFIG_STRAM_SWAP + int i; + unsigned short *map = stram_swap_info->swap_map; + unsigned long max = stram_swap_info->max; + unsigned free = 0, used = 0, rsvd = 0; +#endif - PRINT_PROC("Total ST-RAM: %8u kB\n", +#ifdef CONFIG_STRAM_SWAP + if (max_swap_size) { + for( i = 1; i < max; ++i ) { + if (!map[i]) + ++free; + else if (map[i] == SWAP_MAP_BAD) + ++rsvd; + else + ++used; + } + PRINT_PROC( + "Total ST-RAM: %8u kB\n" + "Total ST-RAM swap: %8lu kB\n" + "Free swap: %8u kB\n" + "Used swap: %8u kB\n" + "Allocated swap: %8u kB\n" + "Swap Reads: %8u\n" + "Swap Writes: %8u\n" + "Swap Forced Reads: %8u\n", + (stram_end - stram_start) >> 10, + (max-1) << (PAGE_SHIFT-10), + free << (PAGE_SHIFT-10), + used << (PAGE_SHIFT-10), + rsvd << (PAGE_SHIFT-10), + stat_swap_read, + stat_swap_write, + stat_swap_force ); + } + else { +#endif + PRINT_PROC( "ST-RAM swapping disabled\n" ); + PRINT_PROC("Total ST-RAM: %8u kB\n", 
(stram_end - stram_start) >> 10); +#ifdef CONFIG_STRAM_SWAP + } +#endif + PRINT_PROC( "Allocated regions:\n" ); for( p = alloc_list; p; p = p->next ) { if (len + 50 >= PAGE_SIZE) @@ -341,6 +1227,8 @@ int get_stram_list( char *buf ) p->owner); if (p->flags & BLOCK_GFP) PRINT_PROC( "page-alloced)\n" ); + else if (p->flags & BLOCK_INSWAP) + PRINT_PROC( "in swap)\n" ); else PRINT_PROC( "??)\n" ); } diff --git a/trunk/arch/m68k/mm/kmap.c b/trunk/arch/m68k/mm/kmap.c index fe2383e36b06..5dcb3fa35ea9 100644 --- a/trunk/arch/m68k/mm/kmap.c +++ b/trunk/arch/m68k/mm/kmap.c @@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag) virtaddr += PTRTREESIZE; size -= PTRTREESIZE; } else { - pte_dir = pte_alloc_kernel(pmd_dir, virtaddr); + pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr); if (!pte_dir) { printk("ioremap: no mem for pte_dir\n"); return NULL; diff --git a/trunk/arch/m68k/sun3x/dvma.c b/trunk/arch/m68k/sun3x/dvma.c index 117481e86305..32e55adfeb8e 100644 --- a/trunk/arch/m68k/sun3x/dvma.c +++ b/trunk/arch/m68k/sun3x/dvma.c @@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr, pte_t *pte; unsigned long end3; - if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) { + if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) { ret = -ENOMEM; goto out; } diff --git a/trunk/arch/mips/kernel/irixelf.c b/trunk/arch/mips/kernel/irixelf.c index 7ce34d4aa220..99262fe64560 100644 --- a/trunk/arch/mips/kernel/irixelf.c +++ b/trunk/arch/mips/kernel/irixelf.c @@ -697,6 +697,7 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. We will * change some of these later. 
*/ + set_mm_counter(current->mm, rss, 0); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); current->mm->start_stack = bprm->p; diff --git a/trunk/arch/mips/mm/ioremap.c b/trunk/arch/mips/mm/ioremap.c index 3101d1db5592..9c44ca70befa 100644 --- a/trunk/arch/mips/mm/ioremap.c +++ b/trunk/arch/mips/mm/ioremap.c @@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -77,6 +77,7 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pmd_t *pmd; @@ -95,6 +96,7 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/arch/parisc/kernel/cache.c b/trunk/arch/parisc/kernel/cache.c index a065349aee37..e15f09eaed12 100644 --- a/trunk/arch/parisc/kernel/cache.c +++ b/trunk/arch/parisc/kernel/cache.c @@ -270,6 +270,7 @@ void flush_dcache_page(struct page *page) unsigned long offset; unsigned long addr; pgoff_t pgoff; + pte_t *pte; unsigned long pfn = page_to_pfn(page); @@ -300,16 +301,21 @@ void flush_dcache_page(struct page *page) * taking a page fault if the pte doesn't exist. * This is just for speed. If the page translation * isn't there, there's no point exciting the - * nadtlb handler into a nullification frenzy. 
- * - * Make sure we really have this page: the private + * nadtlb handler into a nullification frenzy */ + + + if(!(pte = translation_exists(mpnt, addr))) + continue; + + /* make sure we really have this page: the private * mappings may cover this area but have COW'd this - * particular page. - */ - if (translation_exists(mpnt, addr, pfn)) { - __flush_cache_page(mpnt, addr); - break; - } + * particular page */ + if(pte_pfn(*pte) != pfn) + continue; + + __flush_cache_page(mpnt, addr); + + break; } flush_dcache_mmap_unlock(mapping); } diff --git a/trunk/arch/parisc/kernel/pci-dma.c b/trunk/arch/parisc/kernel/pci-dma.c index f94a02ef3d95..ae6213d71670 100644 --- a/trunk/arch/parisc/kernel/pci-dma.c +++ b/trunk/arch/parisc/kernel/pci-dma.c @@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr, if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_kernel(pmd, vaddr); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr); if (!pte) return -ENOMEM; if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr)) diff --git a/trunk/arch/parisc/mm/init.c b/trunk/arch/parisc/mm/init.c index 29b998e430e6..2886ad70db48 100644 --- a/trunk/arch/parisc/mm/init.c +++ b/trunk/arch/parisc/mm/init.c @@ -505,9 +505,7 @@ void show_mem(void) for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; - unsigned long flags; - pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -519,7 +517,6 @@ void show_mem(void) free++; else shared += page_count(p) - 1; - pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif diff --git a/trunk/arch/parisc/mm/ioremap.c b/trunk/arch/parisc/mm/ioremap.c index 5c7a1b3b9326..f2df502cdae3 100644 --- a/trunk/arch/parisc/mm/ioremap.c +++ b/trunk/arch/parisc/mm/ioremap.c @@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * 
pte = pte_alloc_kernel(NULL, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -75,9 +75,10 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc(dir, address); error = -ENOMEM; if (!pmd) break; @@ -88,6 +89,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/arch/ppc/kernel/dma-mapping.c b/trunk/arch/ppc/kernel/dma-mapping.c index 685fd0defe23..0f710d2baec6 100644 --- a/trunk/arch/ppc/kernel/dma-mapping.c +++ b/trunk/arch/ppc/kernel/dma-mapping.c @@ -335,6 +335,8 @@ static int __init dma_alloc_init(void) pte_t *pte; int ret = 0; + spin_lock(&init_mm.page_table_lock); + do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); @@ -345,7 +347,7 @@ static int __init dma_alloc_init(void) } WARN_ON(!pmd_none(*pmd)); - pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); + pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); if (!pte) { printk(KERN_ERR "%s: no pte tables\n", __func__); ret = -ENOMEM; @@ -355,6 +357,8 @@ static int __init dma_alloc_init(void) consistent_pte = pte; } while (0); + spin_unlock(&init_mm.page_table_lock); + return ret; } diff --git a/trunk/arch/ppc/mm/4xx_mmu.c b/trunk/arch/ppc/mm/4xx_mmu.c index 4d006aa1a0d1..b7bcbc232f39 100644 --- a/trunk/arch/ppc/mm/4xx_mmu.c +++ b/trunk/arch/ppc/mm/4xx_mmu.c @@ -110,11 +110,13 @@ unsigned long __init mmu_mapin_ram(void) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + spin_lock(&init_mm.page_table_lock); pmdp = pmd_offset(pgd_offset_k(v), v); pmd_val(*pmdp++) = val; 
pmd_val(*pmdp++) = val; pmd_val(*pmdp++) = val; pmd_val(*pmdp++) = val; + spin_unlock(&init_mm.page_table_lock); v += LARGE_PAGE_SIZE_16M; p += LARGE_PAGE_SIZE_16M; @@ -125,8 +127,10 @@ unsigned long __init mmu_mapin_ram(void) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + spin_lock(&init_mm.page_table_lock); pmdp = pmd_offset(pgd_offset_k(v), v); pmd_val(*pmdp) = val; + spin_unlock(&init_mm.page_table_lock); v += LARGE_PAGE_SIZE_4M; p += LARGE_PAGE_SIZE_4M; diff --git a/trunk/arch/ppc/mm/pgtable.c b/trunk/arch/ppc/mm/pgtable.c index 6ea9185fd120..43505b1fc5d8 100644 --- a/trunk/arch/ppc/mm/pgtable.c +++ b/trunk/arch/ppc/mm/pgtable.c @@ -280,16 +280,18 @@ map_page(unsigned long va, phys_addr_t pa, int flags) pte_t *pg; int err = -ENOMEM; + spin_lock(&init_mm.page_table_lock); /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pgd_offset_k(va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); + pg = pte_alloc_kernel(&init_mm, pd, va); if (pg != 0) { err = 0; set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); if (mem_init_done) flush_HPTE(0, va, pmd_val(*pd)); } + spin_unlock(&init_mm.page_table_lock); return err; } diff --git a/trunk/arch/ppc64/kernel/vdso.c b/trunk/arch/ppc64/kernel/vdso.c index 4aacf521e3e4..efa985f05aca 100644 --- a/trunk/arch/ppc64/kernel/vdso.c +++ b/trunk/arch/ppc64/kernel/vdso.c @@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma, return NOPAGE_SIGBUS; /* - * Last page is systemcfg. 
+ * Last page is systemcfg, special handling here, no get_page() a + * this is a reserved page */ if ((vma->vm_end - address) <= PAGE_SIZE) - pg = virt_to_page(systemcfg); - else - pg = virt_to_page(vbase + offset); + return virt_to_page(systemcfg); + pg = virt_to_page(vbase + offset); get_page(pg); DBG(" ->page count: %d\n", page_count(pg)); @@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack) * gettimeofday will be totally dead. It's fine to use that for setting * breakpoints in the vDSO code pages though */ - vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED; + vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; vma->vm_flags |= mm->def_flags; vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; vma->vm_ops = &vdso_vmops; @@ -603,8 +603,6 @@ void __init vdso_init(void) ClearPageReserved(pg); get_page(pg); } - - get_page(virt_to_page(systemcfg)); } int in_gate_area_no_task(unsigned long addr) diff --git a/trunk/arch/ppc64/mm/imalloc.c b/trunk/arch/ppc64/mm/imalloc.c index f4ca29cf5364..c65b87b92756 100644 --- a/trunk/arch/ppc64/mm/imalloc.c +++ b/trunk/arch/ppc64/mm/imalloc.c @@ -300,7 +300,12 @@ void im_free(void * addr) for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { *p = tmp->next; + + /* XXX: do we need the lock? 
*/ + spin_lock(&init_mm.page_table_lock); unmap_vm_area(tmp); + spin_unlock(&init_mm.page_table_lock); + kfree(tmp); up(&imlist_sem); return; diff --git a/trunk/arch/ppc64/mm/init.c b/trunk/arch/ppc64/mm/init.c index e2bd7776622f..be64b157afce 100644 --- a/trunk/arch/ppc64/mm/init.c +++ b/trunk/arch/ppc64/mm/init.c @@ -104,8 +104,6 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long flags; - pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -116,7 +114,6 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } - pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -158,6 +155,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) unsigned long vsid; if (mem_init_done) { + spin_lock(&init_mm.page_table_lock); pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -165,11 +163,12 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); + ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); + spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; @@ -650,14 +649,11 @@ void __init mem_init(void) #endif for_each_pgdat(pgdat) { - unsigned long flags; - pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } - pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; @@ -871,80 +867,3 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, return vma_prot; } 
EXPORT_SYMBOL(phys_mem_access_prot); - -#ifdef CONFIG_MEMORY_HOTPLUG - -void online_page(struct page *page) -{ - ClearPageReserved(page); - free_cold_page(page); - totalram_pages++; - num_physpages++; -} - -/* - * This works only for the non-NUMA case. Later, we'll need a lookup - * to convert from real physical addresses to nid, that doesn't use - * pfn_to_nid(). - */ -int __devinit add_memory(u64 start, u64 size) -{ - struct pglist_data *pgdata = NODE_DATA(0); - struct zone *zone; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones; - - return __add_pages(zone, start_pfn, nr_pages); - - return 0; -} - -/* - * First pass at this code will check to determine if the remove - * request is within the RMO. Do not allow removal within the RMO. - */ -int __devinit remove_memory(u64 start, u64 size) -{ - struct zone *zone; - unsigned long start_pfn, end_pfn, nr_pages; - - start_pfn = start >> PAGE_SHIFT; - nr_pages = size >> PAGE_SHIFT; - end_pfn = start_pfn + nr_pages; - - printk("%s(): Attempting to remove memoy in range " - "%lx to %lx\n", __func__, start, start+size); - /* - * check for range within RMO - */ - zone = page_zone(pfn_to_page(start_pfn)); - - printk("%s(): memory will be removed from " - "the %s zone\n", __func__, zone->name); - - /* - * not handling removing memory ranges that - * overlap multiple zones yet - */ - if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) - goto overlap; - - /* make sure it is NOT in RMO */ - if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { - printk("%s(): range to be removed must NOT be in RMO!\n", - __func__); - goto in_rmo; - } - - return __remove_pages(zone, start_pfn, nr_pages); - -overlap: - printk("%s(): memory range to be removed overlaps " - "multiple zones!!!\n", __func__); -in_rmo: - return -1; -} -#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git 
a/trunk/arch/s390/mm/ioremap.c b/trunk/arch/s390/mm/ioremap.c index 0f6e9ecbefe2..c6c39d868bc8 100644 --- a/trunk/arch/s390/mm/ioremap.c +++ b/trunk/arch/s390/mm/ioremap.c @@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -80,6 +80,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -93,6 +94,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return 0; } diff --git a/trunk/arch/sh/mm/fault.c b/trunk/arch/sh/mm/fault.c index 775f86cd3fe8..7abba2161da6 100644 --- a/trunk/arch/sh/mm/fault.c +++ b/trunk/arch/sh/mm/fault.c @@ -194,13 +194,10 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, unsigned long address) { unsigned long addrmax = P4SEG; - pgd_t *pgd; + pgd_t *dir; pmd_t *pmd; pte_t *pte; pte_t entry; - struct mm_struct *mm; - spinlock_t *ptl; - int ret = 1; #ifdef CONFIG_SH_KGDB if (kgdb_nofault && kgdb_bus_err_hook) @@ -211,28 +208,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, addrmax = P4SEG_STORE_QUE + 0x04000000; #endif - if (address >= P3SEG && address < addrmax) { - pgd = pgd_offset_k(address); - mm = NULL; - } else if (address >= TASK_SIZE) + if (address >= P3SEG && address < addrmax) + dir = pgd_offset_k(address); + else if (address >= TASK_SIZE) return 1; - else if (!(mm = current->mm)) + else if (!current->mm) return 1; else - pgd = pgd_offset(mm, 
address); + dir = pgd_offset(current->mm, address); - pmd = pmd_offset(pgd, address); - if (pmd_none_or_clear_bad(pmd)) + pmd = pmd_offset(dir, address); + if (pmd_none(*pmd)) return 1; - if (mm) - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - else - pte = pte_offset_kernel(pmd, address); - + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 1; + } + pte = pte_offset_kernel(pmd, address); entry = *pte; if (pte_none(entry) || pte_not_present(entry) || (writeaccess && !pte_write(entry))) - goto unlock; + return 1; if (writeaccess) entry = pte_mkdirty(entry); @@ -254,11 +251,8 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, set_pte(pte, entry); update_mmu_cache(NULL, address, entry); - ret = 0; -unlock: - if (mm) - pte_unmap_unlock(pte, ptl); - return ret; + + return 0; } void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) diff --git a/trunk/arch/sh/mm/hugetlbpage.c b/trunk/arch/sh/mm/hugetlbpage.c index 6b7a7688c98e..95bb1a6c6060 100644 --- a/trunk/arch/sh/mm/hugetlbpage.c +++ b/trunk/arch/sh/mm/hugetlbpage.c @@ -54,6 +54,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } +#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { diff --git a/trunk/arch/sh/mm/ioremap.c b/trunk/arch/sh/mm/ioremap.c index e794e27a72f1..9f490c2742f0 100644 --- a/trunk/arch/sh/mm/ioremap.c +++ b/trunk/arch/sh/mm/ioremap.c @@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -79,6 +79,7 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + 
spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -92,6 +93,7 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/arch/sh64/mm/cache.c b/trunk/arch/sh64/mm/cache.c index c0c1b21350d8..3b87e25ea773 100644 --- a/trunk/arch/sh64/mm/cache.c +++ b/trunk/arch/sh64/mm/cache.c @@ -584,36 +584,32 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr) } } -static void sh64_dcache_purge_user_pages(struct mm_struct *mm, - unsigned long addr, unsigned long end) +static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr) { pgd_t *pgd; pmd_t *pmd; pte_t *pte; pte_t entry; - spinlock_t *ptl; unsigned long paddr; - if (!mm) - return; /* No way to find physical address of page */ - - pgd = pgd_offset(mm, addr); - if (pgd_bad(*pgd)) - return; - - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd) || pmd_bad(*pmd)) - return; - - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - do { - entry = *pte; - if (pte_none(entry) || !pte_present(entry)) - continue; - paddr = pte_val(entry) & PAGE_MASK; - sh64_dcache_purge_coloured_phy_page(paddr, addr); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + /* NOTE : all the callers of this have mm->page_table_lock held, so the + following page table traversal is safe even on SMP/pre-emptible. 
*/ + + if (!mm) return; /* No way to find physical address of page */ + pgd = pgd_offset(mm, eaddr); + if (pgd_bad(*pgd)) return; + + pmd = pmd_offset(pgd, eaddr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) return; + + pte = pte_offset_kernel(pmd, eaddr); + entry = *pte; + if (pte_none(entry) || !pte_present(entry)) return; + + paddr = pte_val(entry) & PAGE_MASK; + + sh64_dcache_purge_coloured_phy_page(paddr, eaddr); + } /****************************************************************************/ @@ -672,7 +668,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, int n_pages; n_pages = ((end - start) >> PAGE_SHIFT); - if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) { + if (n_pages >= 64) { #if 1 sh64_dcache_purge_all(); #else @@ -711,10 +707,20 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, } #endif } else { - /* Small range, covered by a single page table page */ - start &= PAGE_MASK; /* should already be so */ - end = PAGE_ALIGN(end); /* should already be so */ - sh64_dcache_purge_user_pages(mm, start, end); + /* 'Small' range */ + unsigned long aligned_start; + unsigned long eaddr; + unsigned long last_page_start; + + aligned_start = start & PAGE_MASK; + /* 'end' is 1 byte beyond the end of the range */ + last_page_start = (end - 1) & PAGE_MASK; + + eaddr = aligned_start; + while (eaddr <= last_page_start) { + sh64_dcache_purge_user_page(mm, eaddr); + eaddr += PAGE_SIZE; + } } return; } @@ -874,7 +880,9 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, addresses from the user address space specified by mm, after writing back any dirty data. - Note, 'end' is 1 byte beyond the end of the range to flush. */ + Note(1), 'end' is 1 byte beyond the end of the range to flush. 
+ + Note(2), this is called with mm->page_table_lock held.*/ sh64_dcache_purge_user_range(mm, start, end); sh64_icache_inv_user_page_range(mm, start, end); @@ -890,7 +898,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned the I-cache must be searched too in case the page in question is both writable and being executed from (e.g. stack trampolines.) - Note, this is called with pte lock held. + Note(1), this is called with mm->page_table_lock held. */ sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); diff --git a/trunk/arch/sh64/mm/hugetlbpage.c b/trunk/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee2..dcd9c8a8baf8 100644 --- a/trunk/arch/sh64/mm/hugetlbpage.c +++ b/trunk/arch/sh64/mm/hugetlbpage.c @@ -54,31 +54,41 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) + +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, int write_access) { - int i; + unsigned long i; + pte_t entry; + + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + + if (write_access) + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, + vma->vm_page_prot))); + else + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = pte_mkyoung(entry); + mk_pte_huge(entry); for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte_at(mm, addr, ptep, entry); - ptep++; - addr += PAGE_SIZE; + set_pte(page_table, entry); + page_table++; + pte_val(entry) += PAGE_SIZE; } } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) +pte_t huge_ptep_get_and_clear(pte_t *ptep) { pte_t entry; - int i; entry = *ptep; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, addr, ptep); - addr += PAGE_SIZE; - ptep++; + pte_clear(pte); + pte++; } return entry; @@ -96,6 +106,79 @@ int 
is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + int i; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + BUG_ON(!src_pte || pte_none(*src_pte)); + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte(dst_pte, entry); + pte_val(entry) += PAGE_SIZE; + dst_pte++; + } + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + addr += HPAGE_SIZE; + } + return 0; + +nomem: + return -ENOMEM; +} + +int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i) +{ + unsigned long vaddr = *position; + int remainder = *length; + + WARN_ON(!is_vm_hugetlb_page(vma)); + + while (vaddr < vma->vm_end && remainder) { + if (pages) { + pte_t *pte; + struct page *page; + + pte = huge_pte_offset(mm, vaddr); + + /* hugetlb should be locked, and hence, prefaulted */ + BUG_ON(!pte || pte_none(*pte)); + + page = pte_page(*pte); + + WARN_ON(!PageCompound(page)); + + get_page(page); + pages[i] = page; + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + + return i; +} + struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -112,3 +195,84 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } + +void unmap_hugepage_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + struct page *page; + int i; + + 
BUG_ON(start & (HPAGE_SIZE - 1)); + BUG_ON(end & (HPAGE_SIZE - 1)); + + for (address = start; address < end; address += HPAGE_SIZE) { + pte = huge_pte_offset(mm, address); + BUG_ON(!pte); + if (pte_none(*pte)) + continue; + page = pte_page(*pte); + put_page(page); + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(mm, address+(i*PAGE_SIZE), pte); + pte++; + } + } + add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + if (!pte_none(*pte)) + continue; + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! 
ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} diff --git a/trunk/arch/sh64/mm/ioremap.c b/trunk/arch/sh64/mm/ioremap.c index fb1866fa2c9d..f4003da556bc 100644 --- a/trunk/arch/sh64/mm/ioremap.c +++ b/trunk/arch/sh64/mm/ioremap.c @@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -101,6 +101,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd = pmd_alloc(&init_mm, dir, address); error = -ENOMEM; @@ -114,6 +115,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return 0; } diff --git a/trunk/arch/sparc/mm/generic.c b/trunk/arch/sparc/mm/generic.c index 9604893ffdbd..20ccb957fb77 100644 --- a/trunk/arch/sparc/mm/generic.c +++ b/trunk/arch/sparc/mm/generic.c @@ -73,16 +73,14 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; - /* See comment in mm/memory.c remap_pfn_range */ - vma->vm_flags |= VM_IO | VM_RESERVED; - prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); + spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; @@ -92,6 
+90,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } + spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, beg, end); return error; diff --git a/trunk/arch/sparc64/kernel/binfmt_aout32.c b/trunk/arch/sparc64/kernel/binfmt_aout32.c index edf52d06b280..b2854ef221d0 100644 --- a/trunk/arch/sparc64/kernel/binfmt_aout32.c +++ b/trunk/arch/sparc64/kernel/binfmt_aout32.c @@ -241,6 +241,7 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); + set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/trunk/arch/sparc64/mm/generic.c b/trunk/arch/sparc64/mm/generic.c index 112c316e7cd2..c954d91f01d0 100644 --- a/trunk/arch/sparc64/mm/generic.c +++ b/trunk/arch/sparc64/mm/generic.c @@ -127,16 +127,14 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; - /* See comment in mm/memory.c remap_pfn_range */ - vma->vm_flags |= VM_IO | VM_RESERVED; - prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); + spin_lock(&mm->page_table_lock); while (from < end) { - pud_t *pud = pud_alloc(mm, dir, from); + pud_t *pud = pud_alloc(current->mm, dir, from); error = -ENOMEM; if (!pud) break; @@ -146,7 +144,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } - flush_tlb_range(vma, beg, end); + spin_unlock(&mm->page_table_lock); + return error; } diff --git a/trunk/arch/sparc64/mm/tlb.c b/trunk/arch/sparc64/mm/tlb.c index 8b104be4662b..90ca99d0b89c 100644 --- a/trunk/arch/sparc64/mm/tlb.c +++ b/trunk/arch/sparc64/mm/tlb.c @@ -18,7 +18,8 @@ /* Heavily inspired by the ppc64 code. 
*/ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, }; +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = + { NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, }; void flush_tlb_pending(void) { @@ -71,7 +72,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t no_cache_flush: - if (mp->fullmm) + if (mp->tlb_frozen) return; nr = mp->tlb_nr; @@ -96,7 +97,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long unsigned long nr = mp->tlb_nr; long s = start, e = end, vpte_base; - if (mp->fullmm) + if (mp->tlb_frozen) return; /* If start is greater than end, that is a real problem. */ diff --git a/trunk/arch/um/include/tlb.h b/trunk/arch/um/include/tlb.h index 8efc1e0f1b84..45d7da6c3b2c 100644 --- a/trunk/arch/um/include/tlb.h +++ b/trunk/arch/um/include/tlb.h @@ -34,6 +34,7 @@ struct host_vm_op { } u; }; +extern void mprotect_kernel_vm(int w); extern void force_flush_all(void); extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force, diff --git a/trunk/arch/um/kernel/process_kern.c b/trunk/arch/um/kernel/process_kern.c index 34b54a3e2132..0d73ceeece72 100644 --- a/trunk/arch/um/kernel/process_kern.c +++ b/trunk/arch/um/kernel/process_kern.c @@ -222,7 +222,6 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, pud_t *pud; pmd_t *pmd; pte_t *pte; - pte_t ptent; if(task->mm == NULL) return(ERR_PTR(-EINVAL)); @@ -239,13 +238,12 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, return(ERR_PTR(-EINVAL)); pte = pte_offset_kernel(pmd, addr); - ptent = *pte; - if(!pte_present(ptent)) + if(!pte_present(*pte)) return(ERR_PTR(-EINVAL)); if(pte_out != NULL) - *pte_out = ptent; - return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK)); + *pte_out = *pte; + return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); } char *current_cmd(void) diff --git a/trunk/arch/um/kernel/skas/mmu.c b/trunk/arch/um/kernel/skas/mmu.c 
index 9e5e39cea821..240143b616a2 100644 --- a/trunk/arch/um/kernel/skas/mmu.c +++ b/trunk/arch/um/kernel/skas/mmu.c @@ -28,6 +28,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, pmd_t *pmd; pte_t *pte; + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, proc); pud = pud_alloc(mm, pgd, proc); if (!pud) @@ -62,6 +63,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); *pte = pte_mkexec(*pte); *pte = pte_wrprotect(*pte); + spin_unlock(&mm->page_table_lock); return(0); out_pmd: @@ -69,6 +71,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, out_pte: pmd_free(pmd); out: + spin_unlock(&mm->page_table_lock); return(-ENOMEM); } @@ -144,7 +147,6 @@ void destroy_context_skas(struct mm_struct *mm) if(!proc_mm || !ptrace_faultinfo){ free_page(mmu->id.stack); - pte_lock_deinit(virt_to_page(mmu->last_page_table)); pte_free_kernel((pte_t *) mmu->last_page_table); dec_page_state(nr_page_table_pages); #ifdef CONFIG_3_LEVEL_PGTABLES diff --git a/trunk/arch/um/kernel/tt/tlb.c b/trunk/arch/um/kernel/tt/tlb.c index ae6217c86135..f1d85dbb45b9 100644 --- a/trunk/arch/um/kernel/tt/tlb.c +++ b/trunk/arch/um/kernel/tt/tlb.c @@ -74,6 +74,42 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end) atomic_inc(&vmchange_seq); } +static void protect_vm_page(unsigned long addr, int w, int must_succeed) +{ + int err; + + err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed); + if(err == 0) return; + else if((err == -EFAULT) || (err == -ENOMEM)){ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + protect_vm_page(addr, w, 1); + } + else panic("protect_vm_page : protect failed, errno = %d\n", err); +} + +void mprotect_kernel_vm(int w) +{ + struct mm_struct *mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long addr; + + mm = &init_mm; + for(addr = start_vm; addr < end_vm;){ + pgd = pgd_offset(mm, addr); + pud = pud_offset(pgd, 
addr); + pmd = pmd_offset(pud, addr); + if(pmd_present(*pmd)){ + pte = pte_offset_kernel(pmd, addr); + if(pte_present(*pte)) protect_vm_page(addr, w, 0); + addr += PAGE_SIZE; + } + else addr += PMD_SIZE; + } +} + void flush_tlb_kernel_vm_tt(void) { flush_tlb_kernel_range(start_vm, end_vm); diff --git a/trunk/arch/x86_64/ia32/ia32_aout.c b/trunk/arch/x86_64/ia32/ia32_aout.c index 93c60f4aa47a..3e6780fa0186 100644 --- a/trunk/arch/x86_64/ia32/ia32_aout.c +++ b/trunk/arch/x86_64/ia32/ia32_aout.c @@ -314,6 +314,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; + set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/trunk/arch/x86_64/mm/ioremap.c b/trunk/arch/x86_64/mm/ioremap.c index ecf7acb5db9b..6972df480d2b 100644 --- a/trunk/arch/x86_64/mm/ioremap.c +++ b/trunk/arch/x86_64/mm/ioremap.c @@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -105,6 +105,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); + spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pud = pud_alloc(&init_mm, pgd, address); @@ -118,6 +119,7 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; pgd++; } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/trunk/drivers/acpi/acpi_memhotplug.c b/trunk/drivers/acpi/acpi_memhotplug.c index 2143609d2936..01a1bd239263 100644 --- 
a/trunk/drivers/acpi/acpi_memhotplug.c +++ b/trunk/drivers/acpi/acpi_memhotplug.c @@ -200,7 +200,8 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) * Note: Assume that this function returns zero on success */ result = add_memory(mem_device->start_addr, - (mem_device->end_addr - mem_device->start_addr) + 1); + (mem_device->end_addr - mem_device->start_addr) + 1, + mem_device->read_write_attribute); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); mem_device->state = MEMORY_INVALID_STATE; @@ -258,7 +259,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) * Ask the VM to offline this memory range. * Note: Assume that this function returns zero on success */ - result = remove_memory(start, len); + result = remove_memory(start, len, attr); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); return_VALUE(result); diff --git a/trunk/drivers/base/Makefile b/trunk/drivers/base/Makefile index f12898d53078..66d9c4643fc1 100644 --- a/trunk/drivers/base/Makefile +++ b/trunk/drivers/base/Makefile @@ -7,7 +7,6 @@ obj-y := core.o sys.o bus.o dd.o \ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG diff --git a/trunk/drivers/base/init.c b/trunk/drivers/base/init.c index c648914b9cde..84e604e25c4f 100644 --- a/trunk/drivers/base/init.c +++ b/trunk/drivers/base/init.c @@ -9,7 +9,6 @@ #include #include -#include #include "base.h" @@ -34,6 +33,5 @@ void __init driver_init(void) platform_bus_init(); system_bus_init(); cpu_dev_init(); - memory_dev_init(); attribute_container_init(); } diff --git a/trunk/drivers/base/memory.c b/trunk/drivers/base/memory.c deleted file mode 100644 index b7ddd651d664..000000000000 --- a/trunk/drivers/base/memory.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * drivers/base/memory.c - basic Memory class support - * - * Written 
by Matt Tolentino - * Dave Hansen - * - * This file provides the necessary infrastructure to represent - * a SPARSEMEM-memory-model system's physical memory in /sysfs. - * All arch-independent code that assumes MEMORY_HOTPLUG requires - * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. - */ - -#include -#include -#include -#include /* capable() */ -#include -#include -#include -#include -#include -#include -#include -#include - -#define MEMORY_CLASS_NAME "memory" - -static struct sysdev_class memory_sysdev_class = { - set_kset_name(MEMORY_CLASS_NAME), -}; -EXPORT_SYMBOL(memory_sysdev_class); - -static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) -{ - return MEMORY_CLASS_NAME; -} - -static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, - int num_envp, char *buffer, int buffer_size) -{ - int retval = 0; - - return retval; -} - -static struct kset_hotplug_ops memory_hotplug_ops = { - .name = memory_hotplug_name, - .hotplug = memory_hotplug, -}; - -static struct notifier_block *memory_chain; - -static int register_memory_notifier(struct notifier_block *nb) -{ - return notifier_chain_register(&memory_chain, nb); -} - -static void unregister_memory_notifier(struct notifier_block *nb) -{ - notifier_chain_unregister(&memory_chain, nb); -} - -/* - * register_memory - Setup a sysfs device for a memory block - */ -static int -register_memory(struct memory_block *memory, struct mem_section *section, - struct node *root) -{ - int error; - - memory->sysdev.cls = &memory_sysdev_class; - memory->sysdev.id = __section_nr(section); - - error = sysdev_register(&memory->sysdev); - - if (root && !error) - error = sysfs_create_link(&root->sysdev.kobj, - &memory->sysdev.kobj, - kobject_name(&memory->sysdev.kobj)); - - return error; -} - -static void -unregister_memory(struct memory_block *memory, struct mem_section *section, - struct node *root) -{ - BUG_ON(memory->sysdev.cls != &memory_sysdev_class); - 
BUG_ON(memory->sysdev.id != __section_nr(section)); - - sysdev_unregister(&memory->sysdev); - if (root) - sysfs_remove_link(&root->sysdev.kobj, - kobject_name(&memory->sysdev.kobj)); -} - -/* - * use this as the physical section index that this memsection - * uses. - */ - -static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) -{ - struct memory_block *mem = - container_of(dev, struct memory_block, sysdev); - return sprintf(buf, "%08lx\n", mem->phys_index); -} - -/* - * online, offline, going offline, etc. - */ -static ssize_t show_mem_state(struct sys_device *dev, char *buf) -{ - struct memory_block *mem = - container_of(dev, struct memory_block, sysdev); - ssize_t len = 0; - - /* - * We can probably put these states in a nice little array - * so that they're not open-coded - */ - switch (mem->state) { - case MEM_ONLINE: - len = sprintf(buf, "online\n"); - break; - case MEM_OFFLINE: - len = sprintf(buf, "offline\n"); - break; - case MEM_GOING_OFFLINE: - len = sprintf(buf, "going-offline\n"); - break; - default: - len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", - mem->state); - WARN_ON(1); - break; - } - - return len; -} - -static inline int memory_notify(unsigned long val, void *v) -{ - return notifier_call_chain(&memory_chain, val, v); -} - -/* - * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is - * OK to have direct references to sparsemem variables in here. - */ -static int -memory_block_action(struct memory_block *mem, unsigned long action) -{ - int i; - unsigned long psection; - unsigned long start_pfn, start_paddr; - struct page *first_page; - int ret; - int old_state = mem->state; - - psection = mem->phys_index; - first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); - - /* - * The probe routines leave the pages reserved, just - * as the bootmem code does. Make sure they're still - * that way. 
- */ - if (action == MEM_ONLINE) { - for (i = 0; i < PAGES_PER_SECTION; i++) { - if (PageReserved(first_page+i)) - continue; - - printk(KERN_WARNING "section number %ld page number %d " - "not reserved, was it already online? \n", - psection, i); - return -EBUSY; - } - } - - switch (action) { - case MEM_ONLINE: - start_pfn = page_to_pfn(first_page); - ret = online_pages(start_pfn, PAGES_PER_SECTION); - break; - case MEM_OFFLINE: - mem->state = MEM_GOING_OFFLINE; - memory_notify(MEM_GOING_OFFLINE, NULL); - start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; - ret = remove_memory(start_paddr, - PAGES_PER_SECTION << PAGE_SHIFT); - if (ret) { - mem->state = old_state; - break; - } - memory_notify(MEM_MAPPING_INVALID, NULL); - break; - default: - printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", - __FUNCTION__, mem, action, action); - WARN_ON(1); - ret = -EINVAL; - } - /* - * For now, only notify on successful memory operations - */ - if (!ret) - memory_notify(action, NULL); - - return ret; -} - -static int memory_block_change_state(struct memory_block *mem, - unsigned long to_state, unsigned long from_state_req) -{ - int ret = 0; - down(&mem->state_sem); - - if (mem->state != from_state_req) { - ret = -EINVAL; - goto out; - } - - ret = memory_block_action(mem, to_state); - if (!ret) - mem->state = to_state; - -out: - up(&mem->state_sem); - return ret; -} - -static ssize_t -store_mem_state(struct sys_device *dev, const char *buf, size_t count) -{ - struct memory_block *mem; - unsigned int phys_section_nr; - int ret = -EINVAL; - - mem = container_of(dev, struct memory_block, sysdev); - phys_section_nr = mem->phys_index; - - if (!valid_section_nr(phys_section_nr)) - goto out; - - if (!strncmp(buf, "online", min((int)count, 6))) - ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - else if(!strncmp(buf, "offline", min((int)count, 7))) - ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); -out: - if (ret) - return ret; - return count; -} - 
-/* - * phys_device is a bad name for this. What I really want - * is a way to differentiate between memory ranges that - * are part of physical devices that constitute - * a complete removable unit or fru. - * i.e. do these ranges belong to the same physical device, - * s.t. if I offline all of these sections I can then - * remove the physical device? - */ -static ssize_t show_phys_device(struct sys_device *dev, char *buf) -{ - struct memory_block *mem = - container_of(dev, struct memory_block, sysdev); - return sprintf(buf, "%d\n", mem->phys_device); -} - -static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); -static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); -static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); - -#define mem_create_simple_file(mem, attr_name) \ - sysdev_create_file(&mem->sysdev, &attr_##attr_name) -#define mem_remove_simple_file(mem, attr_name) \ - sysdev_remove_file(&mem->sysdev, &attr_##attr_name) - -/* - * Block size attribute stuff - */ -static ssize_t -print_block_size(struct class *class, char *buf) -{ - return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); -} - -static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); - -static int block_size_init(void) -{ - sysfs_create_file(&memory_sysdev_class.kset.kobj, - &class_attr_block_size_bytes.attr); - return 0; -} - -/* - * Some architectures will have custom drivers to do this, and - * will not need to do it from userspace. The fake hot-add code - * as well as ppc64 will do all of their discovery in userspace - * and will require this interface. 
- */ -#ifdef CONFIG_ARCH_MEMORY_PROBE -static ssize_t -memory_probe_store(struct class *class, const char __user *buf, size_t count) -{ - u64 phys_addr; - int ret; - - phys_addr = simple_strtoull(buf, NULL, 0); - - ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); - - if (ret) - count = ret; - - return count; -} -static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); - -static int memory_probe_init(void) -{ - sysfs_create_file(&memory_sysdev_class.kset.kobj, - &class_attr_probe.attr); - return 0; -} -#else -#define memory_probe_init(...) do {} while (0) -#endif - -/* - * Note that phys_device is optional. It is here to allow for - * differentiation between which *physical* devices each - * section belongs to... - */ - -static int add_memory_block(unsigned long node_id, struct mem_section *section, - unsigned long state, int phys_device) -{ - struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); - int ret = 0; - - if (!mem) - return -ENOMEM; - - mem->phys_index = __section_nr(section); - mem->state = state; - init_MUTEX(&mem->state_sem); - mem->phys_device = phys_device; - - ret = register_memory(mem, section, NULL); - if (!ret) - ret = mem_create_simple_file(mem, phys_index); - if (!ret) - ret = mem_create_simple_file(mem, state); - if (!ret) - ret = mem_create_simple_file(mem, phys_device); - - return ret; -} - -/* - * For now, we have a linear search to go find the appropriate - * memory_block corresponding to a particular phys_index. If - * this gets to be a real problem, we can always use a radix - * tree or something here. - * - * This could be made generic for all sysdev classes. 
- */ -static struct memory_block *find_memory_block(struct mem_section *section) -{ - struct kobject *kobj; - struct sys_device *sysdev; - struct memory_block *mem; - char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; - - /* - * This only works because we know that section == sysdev->id - * slightly redundant with sysdev_register() - */ - sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); - - kobj = kset_find_obj(&memory_sysdev_class.kset, name); - if (!kobj) - return NULL; - - sysdev = container_of(kobj, struct sys_device, kobj); - mem = container_of(sysdev, struct memory_block, sysdev); - - return mem; -} - -int remove_memory_block(unsigned long node_id, struct mem_section *section, - int phys_device) -{ - struct memory_block *mem; - - mem = find_memory_block(section); - mem_remove_simple_file(mem, phys_index); - mem_remove_simple_file(mem, state); - mem_remove_simple_file(mem, phys_device); - unregister_memory(mem, section, NULL); - - return 0; -} - -/* - * need an interface for the VM to add new memory regions, - * but without onlining it. - */ -int register_new_memory(struct mem_section *section) -{ - return add_memory_block(0, section, MEM_OFFLINE, 0); -} - -int unregister_memory_section(struct mem_section *section) -{ - if (!valid_section(section)) - return -EINVAL; - - return remove_memory_block(0, section, 0); -} - -/* - * Initialize the sysfs support for memory devices... 
- */ -int __init memory_dev_init(void) -{ - unsigned int i; - int ret; - - memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; - ret = sysdev_class_register(&memory_sysdev_class); - - /* - * Create entries for memory sections that were found - * during boot and have been initialized - */ - for (i = 0; i < NR_MEM_SECTIONS; i++) { - if (!valid_section_nr(i)) - continue; - add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); - } - - memory_probe_init(); - block_size_init(); - - return ret; -} diff --git a/trunk/drivers/scsi/sg.c b/trunk/drivers/scsi/sg.c index 2d30b46806bf..861e51375d70 100644 --- a/trunk/drivers/scsi/sg.c +++ b/trunk/drivers/scsi/sg.c @@ -1886,17 +1886,13 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages, int i; for (i=0; i < nr_pages; i++) { - struct page *page = sgl[i].page; - - /* XXX: just for debug. Remove when PageReserved is removed */ - BUG_ON(PageReserved(page)); - if (dirtied) - SetPageDirty(page); - /* unlock_page(page); */ + if (dirtied && !PageReserved(sgl[i].page)) + SetPageDirty(sgl[i].page); + /* unlock_page(sgl[i].page); */ /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(page); + page_cache_release(sgl[i].page); } return 0; diff --git a/trunk/drivers/scsi/st.c b/trunk/drivers/scsi/st.c index da9766283bd7..5eb54d8019b4 100644 --- a/trunk/drivers/scsi/st.c +++ b/trunk/drivers/scsi/st.c @@ -4526,16 +4526,12 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p int i; for (i=0; i < nr_pages; i++) { - struct page *page = sgl[i].page; - - /* XXX: just for debug. 
Remove when PageReserved is removed */ - BUG_ON(PageReserved(page)); - if (dirtied) - SetPageDirty(page); + if (dirtied && !PageReserved(sgl[i].page)) + SetPageDirty(sgl[i].page); /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(page); + page_cache_release(sgl[i].page); } return 0; diff --git a/trunk/fs/afs/file.c b/trunk/fs/afs/file.c index 4975c9c193dd..0d576987ec67 100644 --- a/trunk/fs/afs/file.c +++ b/trunk/fs/afs/file.c @@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) cachefs_uncache_page(vnode->cache, page); #endif - pageio = (struct cachefs_page *) page_private(page); - set_page_private(page, 0); + pageio = (struct cachefs_page *) page->private; + page->private = 0; ClearPagePrivate(page); if (pageio) diff --git a/trunk/fs/binfmt_aout.c b/trunk/fs/binfmt_aout.c index 72011826f0cb..dd9baabaf016 100644 --- a/trunk/fs/binfmt_aout.c +++ b/trunk/fs/binfmt_aout.c @@ -318,6 +318,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/trunk/fs/binfmt_elf.c b/trunk/fs/binfmt_elf.c index 918ccc267e41..d4b15576e584 100644 --- a/trunk/fs/binfmt_elf.c +++ b/trunk/fs/binfmt_elf.c @@ -773,6 +773,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ + set_mm_counter(current->mm, rss, 0); current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), diff --git a/trunk/fs/binfmt_elf_fdpic.c b/trunk/fs/binfmt_elf_fdpic.c index dda87c4c82a3..134c9c0d1f54 100644 --- a/trunk/fs/binfmt_elf_fdpic.c +++ b/trunk/fs/binfmt_elf_fdpic.c @@ -294,7 +294,14 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs &interp_params, &current->mm->start_stack, &current->mm->start_brk); +#endif + + /* do this so that we can load the interpreter, if need be + * - we will change some of these later + */ + set_mm_counter(current->mm, rss, 0); +#ifdef CONFIG_MMU retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); diff --git a/trunk/fs/binfmt_flat.c b/trunk/fs/binfmt_flat.c index 9d6625829b99..7974efa107bc 100644 --- a/trunk/fs/binfmt_flat.c +++ b/trunk/fs/binfmt_flat.c @@ -650,6 +650,7 @@ static int load_flat_file(struct linux_binprm * bprm, current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; + set_mm_counter(current->mm, rss, 0); } if (flags & FLAT_FLAG_KTRACE) diff --git a/trunk/fs/binfmt_som.c b/trunk/fs/binfmt_som.c index 00a91dc25d16..227a2682d2bf 100644 --- a/trunk/fs/binfmt_som.c +++ b/trunk/fs/binfmt_som.c @@ -259,6 +259,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) create_som_tables(bprm); current->mm->start_stack = bprm->p; + set_mm_counter(current->mm, rss, 0); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff --git a/trunk/fs/buffer.c b/trunk/fs/buffer.c index 2066e4cb700c..b1667986442f 100644 --- a/trunk/fs/buffer.c +++ b/trunk/fs/buffer.c @@ -96,7 +96,7 @@ static void __clear_page_buffers(struct page *page) { 
ClearPagePrivate(page); - set_page_private(page, 0); + page->private = 0; page_cache_release(page); } diff --git a/trunk/fs/compat.c b/trunk/fs/compat.c index 8e71cdbecc7c..a719e158e002 100644 --- a/trunk/fs/compat.c +++ b/trunk/fs/compat.c @@ -1490,6 +1490,7 @@ int compat_do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); + update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/trunk/fs/direct-io.c b/trunk/fs/direct-io.c index 3931e7f1e6bf..0d06097bc995 100644 --- a/trunk/fs/direct-io.c +++ b/trunk/fs/direct-io.c @@ -162,7 +162,6 @@ static int dio_refill_pages(struct dio *dio) up_read(&current->mm->mmap_sem); if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { - struct page *page = ZERO_PAGE(dio->curr_user_address); /* * A memory fault, but the filesystem has some outstanding * mapped blocks. We need to use those blocks up to avoid @@ -170,8 +169,7 @@ static int dio_refill_pages(struct dio *dio) */ if (dio->page_errors == 0) dio->page_errors = ret; - page_cache_get(page); - dio->pages[0] = page; + dio->pages[0] = ZERO_PAGE(dio->curr_user_address); dio->head = 0; dio->tail = 1; ret = 0; diff --git a/trunk/fs/exec.c b/trunk/fs/exec.c index ba73797eb4cb..d2208f7c87db 100644 --- a/trunk/fs/exec.c +++ b/trunk/fs/exec.c @@ -309,36 +309,40 @@ void install_arg_page(struct vm_area_struct *vma, pud_t * pud; pmd_t * pmd; pte_t * pte; - spinlock_t *ptl; if (unlikely(anon_vma_prepare(vma))) - goto out; + goto out_sig; flush_dcache_page(page); pgd = pgd_offset(mm, address); + + spin_lock(&mm->page_table_lock); pud = pud_alloc(mm, pgd, address); if (!pud) goto out; pmd = pmd_alloc(mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_map_lock(mm, pmd, address, &ptl); + pte = pte_alloc_map(mm, pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); goto out; } - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, rss); lru_cache_add_active(page); 
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ return; out: + spin_unlock(&mm->page_table_lock); +out_sig: __free_page(page); force_sig(SIGKILL, current); } @@ -1203,6 +1207,7 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); + update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/trunk/fs/hugetlbfs/inode.c b/trunk/fs/hugetlbfs/inode.c index e026c807e6b3..3a9b6d179cbd 100644 --- a/trunk/fs/hugetlbfs/inode.c +++ b/trunk/fs/hugetlbfs/inode.c @@ -45,58 +45,10 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = { int sysctl_hugetlb_shm_group; -static void huge_pagevec_release(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); ++i) - put_page(pvec->pages[i]); - - pagevec_reinit(pvec); -} - -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache. 
- * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff; - pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; - unsigned long bytes; loff_t len, vma_len; int ret; @@ -115,10 +67,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); down(&inode->i_sem); @@ -131,8 +79,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; - ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); + ret = hugetlb_prefault(mapping, vma); + if (ret) + goto out; + if (inode->i_size < len) inode->i_size = len; out: @@ -142,7 +92,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) } /* - * Called under down_write(mmap_sem). 
+ * Called under down_write(mmap_sem), page_table_lock is not held */ #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA @@ -221,6 +171,16 @@ static int hugetlbfs_commit_write(struct file *file, return -EINVAL; } +static void huge_pagevec_release(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); ++i) + put_page(pvec->pages[i]); + + pagevec_reinit(pvec); +} + static void truncate_huge_page(struct page *page) { clear_page_dirty(page); @@ -264,35 +224,52 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); + + hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); + inode->i_state |= I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) truncate_hugepages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + clear_inode(inode); + destroy_inode(inode); } static void hugetlbfs_forget_inode(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct super_block *super_block = inode->i_sb; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block); - if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_LOCK))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; - if (!sb || (sb->s_flags & MS_ACTIVE)) { - spin_unlock(&inode_lock); - return; - } - inode->i_state |= I_WILL_FREE; + if (hlist_unhashed(&inode->i_hash)) + goto out_truncate; + + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + if (!super_block || (super_block->s_flags & MS_ACTIVE)) { spin_unlock(&inode_lock); - /* - * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK - * in 
our backing_dev_info. - */ - write_inode_now(inode, 1); - spin_lock(&inode_lock); - inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + return; } + + /* write_inode_now() ? */ + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); +out_truncate: list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; @@ -300,6 +277,13 @@ static void hugetlbfs_forget_inode(struct inode *inode) spin_unlock(&inode_lock); if (inode->i_data.nrpages) truncate_hugepages(&inode->i_data, 0); + + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + clear_inode(inode); destroy_inode(inode); } @@ -307,7 +291,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) static void hugetlbfs_drop_inode(struct inode *inode) { if (!inode->i_nlink) - generic_delete_inode(inode); + hugetlbfs_delete_inode(inode); else hugetlbfs_forget_inode(inode); } @@ -324,6 +308,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { unsigned long h_vm_pgoff; + unsigned long v_length; unsigned long v_offset; h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); @@ -334,8 +319,11 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) if (h_vm_pgoff >= h_pgoff) v_offset = 0; - unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end); + v_length = vma->vm_end - vma->vm_start; + + zap_hugepage_range(vma, + vma->vm_start + v_offset, + v_length - v_offset); } } @@ -391,6 +379,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, gid_t gid, int mode, dev_t dev) { struct inode *inode; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return NULL; + } + 
sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } inode = new_inode(sb); if (inode) { @@ -532,51 +531,29 @@ static void hugetlbfs_put_super(struct super_block *sb) } } -static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) -{ - if (sbinfo->free_inodes >= 0) { - spin_lock(&sbinfo->stat_lock); - if (unlikely(!sbinfo->free_inodes)) { - spin_unlock(&sbinfo->stat_lock); - return 0; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } - - return 1; -} - -static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) -{ - if (sbinfo->free_inodes >= 0) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } -} - - static kmem_cache_t *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; - if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) - return NULL; p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); - if (unlikely(!p)) { - hugetlbfs_inc_free_inodes(sbinfo); + if (!p) return NULL; - } return &p->vfs_inode; } +static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + static void hugetlbfs_destroy_inode(struct inode *inode) { - hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -588,16 +565,6 @@ static struct address_space_operations hugetlbfs_aops = { .set_page_dirty = hugetlbfs_set_page_dirty, }; - -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) -{ - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; - - if ((flags & 
(SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ei->vfs_inode); -} - struct file_operations hugetlbfs_file_operations = { .mmap = hugetlbfs_file_mmap, .fsync = simple_sync_file, @@ -625,7 +592,6 @@ static struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .destroy_inode = hugetlbfs_destroy_inode, .statfs = hugetlbfs_statfs, - .delete_inode = hugetlbfs_delete_inode, .drop_inode = hugetlbfs_drop_inode, .put_super = hugetlbfs_put_super, }; diff --git a/trunk/fs/jfs/jfs_metapage.c b/trunk/fs/jfs/jfs_metapage.c index 8a53981f9f27..26091a5f88d4 100644 --- a/trunk/fs/jfs/jfs_metapage.c +++ b/trunk/fs/jfs/jfs_metapage.c @@ -86,7 +86,7 @@ struct meta_anchor { atomic_t io_count; struct metapage *mp[MPS_PER_PAGE]; }; -#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) +#define mp_anchor(page) ((struct meta_anchor *)page->private) static inline struct metapage *page_to_mp(struct page *page, uint offset) { @@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) if (!a) return -ENOMEM; memset(a, 0, sizeof(struct meta_anchor)); - set_page_private(page, (unsigned long)a); + page->private = (unsigned long)a; SetPagePrivate(page); kmap(page); } @@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) a->mp[index] = NULL; if (--a->mp_count == 0) { kfree(a); - set_page_private(page, 0); + page->private = 0; ClearPagePrivate(page); kunmap(page); } @@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) #else static inline struct metapage *page_to_mp(struct page *page, uint offset) { - return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; + return PagePrivate(page) ? 
(struct metapage *)page->private : NULL; } static inline int insert_metapage(struct page *page, struct metapage *mp) { if (mp) { - set_page_private(page, (unsigned long)mp); + page->private = (unsigned long)mp; SetPagePrivate(page); kmap(page); } @@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) static inline void remove_metapage(struct page *page, struct metapage *mp) { - set_page_private(page, 0); + page->private = 0; ClearPagePrivate(page); kunmap(page); } diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index 3e1239e4b303..d84eecacbeaf 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) jiffies_to_clock_t(it_real_value), start_time, vsize, - mm ? get_mm_rss(mm) : 0, + mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, diff --git a/trunk/fs/proc/task_mmu.c b/trunk/fs/proc/task_mmu.c index d2fa42006d8f..c7ef3e48e35b 100644 --- a/trunk/fs/proc/task_mmu.c +++ b/trunk/fs/proc/task_mmu.c @@ -14,41 +14,22 @@ char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data, text, lib; - unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - - /* - * Note: to minimize their overhead, mm maintains hiwater_vm and - * hiwater_rss only when about to *lower* total_vm or rss. Any - * collector of these hiwater stats must therefore get total_vm - * and rss too, which will usually be the higher. Barriers? not - * worth the effort, such snapshots can always be inconsistent. 
- */ - hiwater_vm = total_vm = mm->total_vm; - if (hiwater_vm < mm->hiwater_vm) - hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); - if (hiwater_rss < mm->hiwater_rss) - hiwater_rss = mm->hiwater_rss; data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; buffer += sprintf(buffer, - "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), + get_mm_counter(mm, rss) << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); @@ -63,11 +44,13 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - *shared = get_mm_counter(mm, file_rss); + int rss = get_mm_counter(mm, rss); + + *shared = rss - get_mm_counter(mm, anon_rss); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = rss; return mm->total_vm; } @@ -203,14 +186,13 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct mem_size_stats *mss) { pte_t *pte, ptent; - spinlock_t *ptl; unsigned long pfn; struct page *page; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + pte = pte_offset_map(pmd, addr); do { ptent = *pte; - if (!pte_present(ptent)) + if (pte_none(ptent) || !pte_present(ptent)) continue; mss->resident += PAGE_SIZE; @@ -231,8 +213,8 @@ static void 
smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, mss->private_clean += PAGE_SIZE; } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); + pte_unmap(pte - 1); + cond_resched_lock(&vma->vm_mm->page_table_lock); } static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -286,11 +268,17 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma, static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; + struct mm_struct *mm = vma->vm_mm; struct mem_size_stats mss; memset(&mss, 0, sizeof mss); - if (vma->vm_mm) + + if (mm) { + spin_lock(&mm->page_table_lock); smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); + spin_unlock(&mm->page_table_lock); + } + return show_map_internal(m, v, &mss); } @@ -419,6 +407,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) for_each_node(i) md->node[i] =0; + spin_lock(&mm->page_table_lock); for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { page = follow_page(mm, vaddr, 0); if (page) { @@ -433,8 +422,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->anon++; md->node[page_to_nid(page)]++; } - cond_resched(); } + spin_unlock(&mm->page_table_lock); return md; } @@ -480,7 +469,7 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " interleave={"); first = 1; for_each_node(n) { - if (node_isset(n, pol->v.nodes)) { + if (test_bit(n, pol->v.nodes)) { if (!first) seq_putc(m,','); else diff --git a/trunk/fs/xfs/linux-2.6/xfs_buf.c b/trunk/fs/xfs/linux-2.6/xfs_buf.c index 4cd46abe8434..ba4767c04adf 100644 --- a/trunk/fs/xfs/linux-2.6/xfs_buf.c +++ b/trunk/fs/xfs/linux-2.6/xfs_buf.c @@ -181,9 +181,8 @@ set_page_region( size_t offset, size_t length) { - set_page_private(page, - page_private(page) | page_region_mask(offset, length)); - if (page_private(page) == ~0UL) + page->private |= page_region_mask(offset, length); + if (page->private == 
~0UL) SetPageUptodate(page); } @@ -195,7 +194,7 @@ test_page_region( { unsigned long mask = page_region_mask(offset, length); - return (mask && (page_private(page) & mask) == mask); + return (mask && (page->private & mask) == mask); } /* diff --git a/trunk/include/asm-alpha/barrier.h b/trunk/include/asm-alpha/barrier.h index 681ff581afa5..229c83fe77cb 100644 --- a/trunk/include/asm-alpha/barrier.h +++ b/trunk/include/asm-alpha/barrier.h @@ -1,8 +1,6 @@ #ifndef __BARRIER_H #define __BARRIER_H -#include - #define mb() \ __asm__ __volatile__("mb": : :"memory") diff --git a/trunk/include/asm-alpha/rwsem.h b/trunk/include/asm-alpha/rwsem.h index fafdd4f7010a..8e058a67c9a4 100644 --- a/trunk/include/asm-alpha/rwsem.h +++ b/trunk/include/asm-alpha/rwsem.h @@ -262,10 +262,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) #endif } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/trunk/include/asm-arm/tlb.h b/trunk/include/asm-arm/tlb.h index f49bfb78c221..9bb325c54645 100644 --- a/trunk/include/asm-arm/tlb.h +++ b/trunk/include/asm-arm/tlb.h @@ -27,7 +27,11 @@ */ struct mmu_gather { struct mm_struct *mm; + unsigned int freed; unsigned int fullmm; + + unsigned int flushes; + unsigned int avoided_flushes; }; DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -35,9 +39,11 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu = smp_processor_id(); + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); tlb->mm = mm; + tlb->freed = 0; tlb->fullmm = full_mm_flush; return tlb; @@ -46,13 +52,24 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { + struct mm_struct *mm 
= tlb->mm; + unsigned long freed = tlb->freed; + int rss = get_mm_counter(mm, rss); + + if (rss < freed) + freed = rss; + add_mm_counter(mm, rss, -freed); + if (tlb->fullmm) - flush_tlb_mm(tlb->mm); + flush_tlb_mm(mm); /* keep the page table cache within bounds */ check_pgt_cache(); +} - put_cpu_var(mmu_gathers); +static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb) +{ + return tlb->fullmm; } #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) diff --git a/trunk/include/asm-arm26/tlb.h b/trunk/include/asm-arm26/tlb.h index 08ddd85b8d35..1316352a58f3 100644 --- a/trunk/include/asm-arm26/tlb.h +++ b/trunk/include/asm-arm26/tlb.h @@ -10,20 +10,24 @@ */ struct mmu_gather { struct mm_struct *mm; - unsigned int need_flush; - unsigned int fullmm; + unsigned int freed; + unsigned int fullmm; + + unsigned int flushes; + unsigned int avoided_flushes; }; -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +extern struct mmu_gather mmu_gathers[NR_CPUS]; static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu = smp_processor_id(); + struct mmu_gather *tlb = &mmu_gathers[cpu]; tlb->mm = mm; - tlb->need_flush = 0; - tlb->fullmm = full_mm_flush; + tlb->freed = 0; + tlb->fullmm = full_mm_flush; return tlb; } @@ -31,13 +35,30 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - if (tlb->need_flush) - flush_tlb_mm(tlb->mm); + struct mm_struct *mm = tlb->mm; + unsigned long freed = tlb->freed; + int rss = get_mm_counter(mm, rss); + + if (rss < freed) + freed = rss; + add_mm_counter(mm, rss, -freed); + + if (freed) { + flush_tlb_mm(mm); + tlb->flushes++; + } else { + tlb->avoided_flushes++; + } /* keep the page table cache within bounds */ check_pgt_cache(); +} + - put_cpu_var(mmu_gathers); +static inline unsigned int +tlb_is_full_mm(struct 
mmu_gather *tlb) +{ + return tlb->fullmm; } #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) @@ -50,13 +71,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) } while (0) #define tlb_end_vma(tlb,vma) do { } while (0) -static inline void -tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - free_page_and_swap_cache(page); -} - +#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) #define pte_free_tlb(tlb,ptep) pte_free(ptep) #define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) diff --git a/trunk/include/asm-generic/4level-fixup.h b/trunk/include/asm-generic/4level-fixup.h index 68c6fea994d9..c20ec257ecc0 100644 --- a/trunk/include/asm-generic/4level-fixup.h +++ b/trunk/include/asm-generic/4level-fixup.h @@ -10,9 +10,14 @@ #define pud_t pgd_t -#define pmd_alloc(mm, pud, address) \ - ((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \ - NULL: pmd_offset(pud, address)) +#define pmd_alloc(mm, pud, address) \ +({ pmd_t *ret; \ + if (pgd_none(*pud)) \ + ret = __pmd_alloc(mm, pud, address); \ + else \ + ret = pmd_offset(pud, address); \ + ret; \ +}) #define pud_alloc(mm, pgd, address) (pgd) #define pud_offset(pgd, start) (pgd) diff --git a/trunk/include/asm-generic/pgtable.h b/trunk/include/asm-generic/pgtable.h index 7dca30a26c53..ff28c8b31f58 100644 --- a/trunk/include/asm-generic/pgtable.h +++ b/trunk/include/asm-generic/pgtable.h @@ -8,7 +8,7 @@ * - update the page tables * - inform the TLB about the new one * - * We hold the mm semaphore for reading, and the pte lock. + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. * * Note: the old pte is known to not be writable, so we don't need to * worry about dirty bits etc getting lost. 
diff --git a/trunk/include/asm-generic/tlb.h b/trunk/include/asm-generic/tlb.h index cdd4145243cd..7d0298347ee7 100644 --- a/trunk/include/asm-generic/tlb.h +++ b/trunk/include/asm-generic/tlb.h @@ -35,13 +35,16 @@ #endif /* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. + * any data needed by arch specific code for tlb_remove_page. This structure + * can be per-CPU or per-MM as the page table lock is held for the duration of + * TLB shootdown. */ struct mmu_gather { struct mm_struct *mm; unsigned int nr; /* set to ~0U means fast mode */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ + unsigned long freed; struct page * pages[FREE_PTE_NR]; }; @@ -54,7 +57,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id()); tlb->mm = mm; @@ -62,6 +65,7 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; tlb->fullmm = full_mm_flush; + tlb->freed = 0; return tlb; } @@ -81,17 +85,28 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources - * that were required. + * that were required. The page table lock is still held at this point. 
*/ static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { + int freed = tlb->freed; + struct mm_struct *mm = tlb->mm; + int rss = get_mm_counter(mm, rss); + + if (rss < freed) + freed = rss; + add_mm_counter(mm, rss, -freed); tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ check_pgt_cache(); +} - put_cpu_var(mmu_gathers); +static inline unsigned int +tlb_is_full_mm(struct mmu_gather *tlb) +{ + return tlb->fullmm; } /* tlb_remove_page diff --git a/trunk/include/asm-i386/mmzone.h b/trunk/include/asm-i386/mmzone.h index 620a90641ea8..348fe3a4879d 100644 --- a/trunk/include/asm-i386/mmzone.h +++ b/trunk/include/asm-i386/mmzone.h @@ -88,6 +88,12 @@ static inline int pfn_to_nid(unsigned long pfn) __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) +#define local_mapnr(kvaddr) \ +({ \ + unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ +}) + /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) diff --git a/trunk/include/asm-i386/pgtable.h b/trunk/include/asm-i386/pgtable.h index 0e3ec809352d..d101ac414f07 100644 --- a/trunk/include/asm-i386/pgtable.h +++ b/trunk/include/asm-i386/pgtable.h @@ -203,8 +203,7 @@ extern unsigned long pg0[]; #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) -/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ -#define pmd_none(x) (!(unsigned long)pmd_val(x)) +#define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) diff --git a/trunk/include/asm-i386/rwsem.h b/trunk/include/asm-i386/rwsem.h index be4ab859238e..7625a675852f 100644 --- a/trunk/include/asm-i386/rwsem.h +++ b/trunk/include/asm-i386/rwsem.h @@ 
-284,10 +284,5 @@ LOCK_PREFIX "xadd %0,(%2)" return tmp+delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _I386_RWSEM_H */ diff --git a/trunk/include/asm-ia64/rwsem.h b/trunk/include/asm-ia64/rwsem.h index 1327c91ea39c..e18b5ab0cb75 100644 --- a/trunk/include/asm-ia64/rwsem.h +++ b/trunk/include/asm-ia64/rwsem.h @@ -186,9 +186,4 @@ __downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* _ASM_IA64_RWSEM_H */ diff --git a/trunk/include/asm-ia64/tlb.h b/trunk/include/asm-ia64/tlb.h index 834370b9dea1..3a9a6d1be75c 100644 --- a/trunk/include/asm-ia64/tlb.h +++ b/trunk/include/asm-ia64/tlb.h @@ -60,6 +60,7 @@ struct mmu_gather { unsigned int nr; /* == ~0U => fast mode */ unsigned char fullmm; /* non-zero means full mm flush */ unsigned char need_flush; /* really unmapped some PTEs? */ + unsigned long freed; /* number of pages freed */ unsigned long start_addr; unsigned long end_addr; struct page *pages[FREE_PTE_NR]; @@ -128,7 +129,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e static inline struct mmu_gather * tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers); tlb->mm = mm; /* @@ -146,17 +147,25 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) */ tlb->nr = (num_online_cpus() == 1) ? ~0U : 0; tlb->fullmm = full_mm_flush; + tlb->freed = 0; tlb->start_addr = ~0UL; return tlb; } /* * Called at the end of the shootdown operation to free up any resources that were - * collected. + * collected. 
The page table lock is still held at this point. */ static inline void tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) { + unsigned long freed = tlb->freed; + struct mm_struct *mm = tlb->mm; + unsigned long rss = get_mm_counter(mm, rss); + + if (rss < freed) + freed = rss; + add_mm_counter(mm, rss, -freed); /* * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and * tlb->end_addr. @@ -165,8 +174,12 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); +} - put_cpu_var(mmu_gathers); +static inline unsigned int +tlb_is_full_mm(struct mmu_gather *tlb) +{ + return tlb->fullmm; } /* diff --git a/trunk/include/asm-m32r/mmzone.h b/trunk/include/asm-m32r/mmzone.h index adc7970a77ec..d58878ec899e 100644 --- a/trunk/include/asm-m32r/mmzone.h +++ b/trunk/include/asm-m32r/mmzone.h @@ -21,6 +21,12 @@ extern struct pglist_data *node_data[]; __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) +#define local_mapnr(kvaddr) \ +({ \ + unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ +}) + #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ diff --git a/trunk/include/asm-parisc/cacheflush.h b/trunk/include/asm-parisc/cacheflush.h index 1bc3c83ee74b..aa592d8c0e39 100644 --- a/trunk/include/asm-parisc/cacheflush.h +++ b/trunk/include/asm-parisc/cacheflush.h @@ -100,34 +100,30 @@ static inline void flush_cache_range(struct vm_area_struct *vma, /* Simple function to work out if we have an existing address translation * for a user space vma. 
*/ -static inline int translation_exists(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn) +static inline pte_t *__translation_exists(struct mm_struct *mm, + unsigned long addr) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pgd_t *pgd = pgd_offset(mm, addr); pmd_t *pmd; - pte_t pte; + pte_t *pte; if(pgd_none(*pgd)) - return 0; + return NULL; pmd = pmd_offset(pgd, addr); if(pmd_none(*pmd) || pmd_bad(*pmd)) - return 0; + return NULL; - /* We cannot take the pte lock here: flush_cache_page is usually - * called with pte lock already held. Whereas flush_dcache_page - * takes flush_dcache_mmap_lock, which is lower in the hierarchy: - * the vma itself is secure, but the pte might come or go racily. - */ - pte = *pte_offset_map(pmd, addr); - /* But pte_unmap() does nothing on this architecture */ + pte = pte_offset_map(pmd, addr); - /* Filter out coincidental file entries and swap entries */ - if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT))) - return 0; - - return pte_pfn(pte) == pfn; + /* The PA flush mappings show up as pte_none, but they're + * valid none the less */ + if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0)) + return NULL; + return pte; } +#define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr) + /* Private function to flush a page from the cache of a non-current * process. 
cr25 contains the Page Directory of the current user @@ -179,8 +175,9 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long { BUG_ON(!vma->vm_mm->context); - if (likely(translation_exists(vma, vmaddr, pfn))) + if(likely(translation_exists(vma, vmaddr))) __flush_cache_page(vma, vmaddr); } #endif + diff --git a/trunk/include/asm-parisc/mmzone.h b/trunk/include/asm-parisc/mmzone.h index ae039f4fd711..595d3dce120a 100644 --- a/trunk/include/asm-parisc/mmzone.h +++ b/trunk/include/asm-parisc/mmzone.h @@ -27,6 +27,12 @@ extern struct node_map_data node_data[]; }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) +#define local_mapnr(kvaddr) \ +({ \ + unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ +}) + #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ diff --git a/trunk/include/asm-parisc/tlbflush.h b/trunk/include/asm-parisc/tlbflush.h index e97aa8d1eff5..84af4ab1fe51 100644 --- a/trunk/include/asm-parisc/tlbflush.h +++ b/trunk/include/asm-parisc/tlbflush.h @@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */ flush_tlb_all(); else { - preempt_disable(); + mtsp(vma->vm_mm->context,1); purge_tlb_start(); if (split_tlb) { @@ -102,7 +102,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, pdtlb(start); start += PAGE_SIZE; } - preempt_enable(); } purge_tlb_end(); } diff --git a/trunk/include/asm-ppc/rwsem.h b/trunk/include/asm-ppc/rwsem.h index 3501ea72f88c..3e738f483c11 100644 --- a/trunk/include/asm-ppc/rwsem.h +++ b/trunk/include/asm-ppc/rwsem.h @@ -168,10 +168,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _PPC_RWSEM_XADD_H */ diff 
--git a/trunk/include/asm-ppc64/mmzone.h b/trunk/include/asm-ppc64/mmzone.h index 80a708e7093a..ed473f4b0152 100644 --- a/trunk/include/asm-ppc64/mmzone.h +++ b/trunk/include/asm-ppc64/mmzone.h @@ -67,6 +67,9 @@ static inline int pa_to_nid(unsigned long pa) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) +#define local_mapnr(kvaddr) \ + ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) + #ifdef CONFIG_DISCONTIGMEM /* diff --git a/trunk/include/asm-ppc64/pgtable.h b/trunk/include/asm-ppc64/pgtable.h index 2eb1778a3a15..c83679c9d2b0 100644 --- a/trunk/include/asm-ppc64/pgtable.h +++ b/trunk/include/asm-ppc64/pgtable.h @@ -478,12 +478,10 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) diff --git a/trunk/include/asm-ppc64/rwsem.h b/trunk/include/asm-ppc64/rwsem.h index 7a647fae3765..bd5c2f093575 100644 --- a/trunk/include/asm-ppc64/rwsem.h +++ b/trunk/include/asm-ppc64/rwsem.h @@ -163,10 +163,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _PPC_RWSEM_XADD_H */ diff --git a/trunk/include/asm-s390/rwsem.h b/trunk/include/asm-s390/rwsem.h index 0422a085dd56..8c0cebbfc034 100644 --- 
a/trunk/include/asm-s390/rwsem.h +++ b/trunk/include/asm-s390/rwsem.h @@ -351,10 +351,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/trunk/include/asm-sh/rwsem.h b/trunk/include/asm-sh/rwsem.h index 0262d3d1e5e0..1be4337f5259 100644 --- a/trunk/include/asm-sh/rwsem.h +++ b/trunk/include/asm-sh/rwsem.h @@ -166,10 +166,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_SH_RWSEM_H */ diff --git a/trunk/include/asm-sparc64/rwsem.h b/trunk/include/asm-sparc64/rwsem.h index cef5e8270421..4568ee4022df 100644 --- a/trunk/include/asm-sparc64/rwsem.h +++ b/trunk/include/asm-sparc64/rwsem.h @@ -56,11 +56,6 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) atomic_add(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _SPARC64_RWSEM_H */ diff --git a/trunk/include/asm-sparc64/tlb.h b/trunk/include/asm-sparc64/tlb.h index 66138d959df5..9baf57db01d2 100644 --- a/trunk/include/asm-sparc64/tlb.h +++ b/trunk/include/asm-sparc64/tlb.h @@ -25,8 +25,9 @@ struct mmu_gather { struct mm_struct *mm; unsigned int pages_nr; unsigned int need_flush; - unsigned int fullmm; + unsigned int tlb_frozen; unsigned int tlb_nr; + unsigned long freed; unsigned long vaddrs[TLB_BATCH_NR]; struct page *pages[FREE_PTE_NR]; }; @@ -43,13 +44,14 @@ extern void flush_tlb_pending(void); static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *mp = &get_cpu_var(mmu_gathers); + 
struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); BUG_ON(mp->tlb_nr); mp->mm = mm; mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U; - mp->fullmm = full_mm_flush; + mp->tlb_frozen = full_mm_flush; + mp->freed = 0; return mp; } @@ -76,19 +78,30 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm); static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end) { + unsigned long freed = mp->freed; + struct mm_struct *mm = mp->mm; + unsigned long rss = get_mm_counter(mm, rss); + + if (rss < freed) + freed = rss; + add_mm_counter(mm, rss, -freed); + tlb_flush_mmu(mp); - if (mp->fullmm) { - if (CTX_VALID(mp->mm->context)) - do_flush_tlb_mm(mp->mm); - mp->fullmm = 0; + if (mp->tlb_frozen) { + if (CTX_VALID(mm->context)) + do_flush_tlb_mm(mm); + mp->tlb_frozen = 0; } else flush_tlb_pending(); /* keep the page table cache within bounds */ check_pgt_cache(); +} - put_cpu_var(mmu_gathers); +static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp) +{ + return mp->tlb_frozen; } static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) diff --git a/trunk/include/asm-um/pgtable.h b/trunk/include/asm-um/pgtable.h index ac64eb955868..616d02b57ea9 100644 --- a/trunk/include/asm-um/pgtable.h +++ b/trunk/include/asm-um/pgtable.h @@ -138,7 +138,7 @@ extern unsigned long pg0[1024]; #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) -#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) diff --git a/trunk/include/asm-x86_64/rwsem.h b/trunk/include/asm-x86_64/rwsem.h index 46077e9c1910..c002175b6e82 100644 --- a/trunk/include/asm-x86_64/rwsem.h +++ b/trunk/include/asm-x86_64/rwsem.h @@ -274,10 +274,5 @@ LOCK_PREFIX "xaddl 
%0,(%2)" return tmp+delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _X8664_RWSEM_H */ diff --git a/trunk/include/linux/buffer_head.h b/trunk/include/linux/buffer_head.h index c937d6e65502..88af42f5e04a 100644 --- a/trunk/include/linux/buffer_head.h +++ b/trunk/include/linux/buffer_head.h @@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ - BUG_ON(!PagePrivate(page)); \ - ((struct buffer_head *)page_private(page)); \ + BUG_ON(!PagePrivate(page)); \ + ((struct buffer_head *)(page)->private); \ }) #define page_has_buffers(page) PagePrivate(page) @@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, { page_cache_get(page); SetPagePrivate(page); - set_page_private(page, (unsigned long)head); + page->private = (unsigned long)head; } static inline void get_bh(struct buffer_head *bh) diff --git a/trunk/include/linux/hugetlb.h b/trunk/include/linux/hugetlb.h index 0cea162b08c0..d664330d900e 100644 --- a/trunk/include/linux/hugetlb.h +++ b/trunk/include/linux/hugetlb.h @@ -16,6 +16,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); +void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); @@ -86,6 +87,7 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) 
ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) +#define zap_hugepage_range(vma, start, len) BUG() #define unmap_hugepage_range(vma, start, end) BUG() #define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 diff --git a/trunk/include/linux/memory.h b/trunk/include/linux/memory.h deleted file mode 100644 index 0def328ab5cf..000000000000 --- a/trunk/include/linux/memory.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * include/linux/memory.h - generic memory definition - * - * This is mainly for topological representation. We define the - * basic "struct memory_block" here, which can be embedded in per-arch - * definitions or NUMA information. - * - * Basic handling of the devices is done in drivers/base/memory.c - * and system devices are handled in drivers/base/sys.c. - * - * Memory block are exported via sysfs in the class/memory/devices/ - * directory. - * - */ -#ifndef _LINUX_MEMORY_H_ -#define _LINUX_MEMORY_H_ - -#include -#include -#include - -#include - -struct memory_block { - unsigned long phys_index; - unsigned long state; - /* - * This serializes all state change requests. It isn't - * held during creation because the control files are - * created long after the critical areas during - * initialization. - */ - struct semaphore state_sem; - int phys_device; /* to which fru does this belong? */ - void *hw; /* optional pointer to fw/hw data */ - int (*phys_callback)(struct memory_block *); - struct sys_device sysdev; -}; - -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ - -/* - * All of these states are currently kernel-internal for notifying - * kernel components and architectures. 
- * - * For MEM_MAPPING_INVALID, all notifier chains with priority >0 - * are called before pfn_to_page() becomes invalid. The priority=0 - * entry is reserved for the function that actually makes - * pfn_to_page() stop working. Any notifiers that want to be called - * after that should have priority <0. - */ -#define MEM_MAPPING_INVALID (1<<3) - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline int memory_dev_init(void) -{ - return 0; -} -static inline int register_memory_notifier(struct notifier_block *nb) -{ - return 0; -} -static inline void unregister_memory_notifier(struct notifier_block *nb) -{ -} -#else -extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); -extern int register_new_memory(struct mem_section *); -extern int unregister_memory_section(struct mem_section *); -extern int memory_dev_init(void); -extern int register_memory_notifier(struct notifier_block *nb); -extern void unregister_memory_notifier(struct notifier_block *nb); - -#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< -#include -#include -#include - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * pgdat resizing functions - */ -static inline -void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) -{ - spin_lock_irqsave(&pgdat->node_size_lock, *flags); -} -static inline -void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) -{ - spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); -} -static inline -void pgdat_resize_init(struct pglist_data *pgdat) -{ - spin_lock_init(&pgdat->node_size_lock); -} -/* - * Zone resizing functions - */ -static inline unsigned zone_span_seqbegin(struct zone *zone) -{ - return read_seqbegin(&zone->span_seqlock); -} -static inline int zone_span_seqretry(struct zone *zone, unsigned iv) -{ - return read_seqretry(&zone->span_seqlock, iv); -} -static inline void zone_span_writelock(struct zone *zone) -{ - write_seqlock(&zone->span_seqlock); -} -static inline void zone_span_writeunlock(struct zone 
*zone) -{ - write_sequnlock(&zone->span_seqlock); -} -static inline void zone_seqlock_init(struct zone *zone) -{ - seqlock_init(&zone->span_seqlock); -} -extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); -extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); -extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -/* need some defines for these for archs that don't support it */ -extern void online_page(struct page *page); -/* VM interface that may be used by firmware interface */ -extern int add_memory(u64 start, u64 size); -extern int remove_memory(u64 start, u64 size); -extern int online_pages(unsigned long, unsigned long); - -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); -#else /* ! CONFIG_MEMORY_HOTPLUG */ -/* - * Stub functions for when hotplug is off - */ -static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} -static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} -static inline void pgdat_resize_init(struct pglist_data *pgdat) {} - -static inline unsigned zone_span_seqbegin(struct zone *zone) -{ - return 0; -} -static inline int zone_span_seqretry(struct zone *zone, unsigned iv) -{ - return 0; -} -static inline void zone_span_writelock(struct zone *zone) {} -static inline void zone_span_writeunlock(struct zone *zone) {} -static inline void zone_seqlock_init(struct zone *zone) {} - -static inline int mhp_notimplemented(const char *func) -{ - printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); - dump_stack(); - return -ENOSYS; -} - -static inline int __add_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages) -{ - return mhp_notimplemented(__FUNCTION__); -} -#endif /* ! 
CONFIG_MEMORY_HOTPLUG */ -static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages) -{ - printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); - dump_stack(); - return -ENOSYS; -} -#endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/trunk/include/linux/mempolicy.h b/trunk/include/linux/mempolicy.h index 7af8cb836e78..58385ee1c0ac 100644 --- a/trunk/include/linux/mempolicy.h +++ b/trunk/include/linux/mempolicy.h @@ -27,10 +27,10 @@ #include #include +#include #include #include #include -#include struct vm_area_struct; @@ -47,7 +47,8 @@ struct vm_area_struct; * Locking policy for interlave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on - * mmap_sem. + * mmap_sem. For allocating in the interleave policy the page_table_lock + * must be also aquired to protect il_next. * * Freeing policy: * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. 
@@ -62,7 +63,7 @@ struct mempolicy { union { struct zonelist *zonelist; /* bind */ short preferred_node; /* preferred */ - nodemask_t nodes; /* interleave */ + DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ /* undefined for default */ } v; }; diff --git a/trunk/include/linux/mm.h b/trunk/include/linux/mm.h index 5c1fb0a2e806..e1649578fb0c 100644 --- a/trunk/include/linux/mm.h +++ b/trunk/include/linux/mm.h @@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Pages managed in a special way */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ @@ -226,18 +226,13 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - union { - unsigned long private; /* Mapping-private opaque data: + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - spinlock_t ptl; -#endif - } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. 
* If page mapped as anonymous @@ -265,9 +260,6 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; -#define page_private(page) ((page)->u.private) -#define set_page_private(page, v) ((page)->u.private = (v)) - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -319,17 +311,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); #ifdef CONFIG_HUGETLB_PAGE -static inline int page_count(struct page *page) +static inline int page_count(struct page *p) { - if (PageCompound(page)) - page = (struct page *)page_private(page); - return atomic_read(&page->_count) + 1; + if (PageCompound(p)) + p = (struct page *)p->private; + return atomic_read(&(p)->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); + page = (struct page *)page->private; atomic_inc(&page->_count); } @@ -346,7 +338,7 @@ static inline void get_page(struct page *page) static inline void put_page(struct page *page) { - if (put_page_testzero(page)) + if (!PageReserved(page) && put_page_testzero(page)) __page_cache_release(page); } @@ -595,7 +587,7 @@ static inline int PageAnon(struct page *page) static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page_private(page); + return page->private; return page->index; } @@ -690,7 +682,7 @@ struct zap_details { unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, +unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); @@ -712,6 +704,10 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, } extern int vmtruncate(struct inode * inode, loff_t offset); +extern pud_t 
*FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -727,7 +723,6 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); @@ -764,83 +759,38 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); - +/* + * On a two-level or three-level page table, this ends up being trivial. Thus + * the inlining and the symmetry break with pte_alloc_map() that does all + * of this out-of-line. + */ /* * The following ifdef needed to get the 4level-fixup.h header to work. * Remove it when 4level-fixup.h has been removed. 
*/ -#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) +#ifdef CONFIG_MMU +#ifndef __ARCH_HAS_4LEVEL_HACK static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? - NULL: pud_offset(pgd, address); + if (pgd_none(*pgd)) + return __pud_alloc(mm, pgd, address); + return pud_offset(pgd, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? - NULL: pmd_offset(pud, address); + if (pud_none(*pud)) + return __pmd_alloc(mm, pud, address); + return pmd_offset(pud, address); } -#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ - -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS -/* - * We tuck a spinlock to guard each pagetable page into its struct page, - * at page->private, with BUILD_BUG_ON to make sure that this will not - * overflow into the next struct page (as it might with DEBUG_SPINLOCK). - * When freeing, reset page->mapping so free_pages_check won't complain. - */ -#define __pte_lockptr(page) &((page)->u.ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) -#define pte_lock_deinit(page) ((page)->mapping = NULL) -#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else -/* - * We use mm->page_table_lock to guard all pagetable pages of the mm. 
- */ -#define pte_lock_init(page) do {} while (0) -#define pte_lock_deinit(page) do {} while (0) -#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ - -#define pte_offset_map_lock(mm, pmd, address, ptlp) \ -({ \ - spinlock_t *__ptl = pte_lockptr(mm, pmd); \ - pte_t *__pte = pte_offset_map(pmd, address); \ - *(ptlp) = __ptl; \ - spin_lock(__ptl); \ - __pte; \ -}) - -#define pte_unmap_unlock(pte, ptl) do { \ - spin_unlock(ptl); \ - pte_unmap(pte); \ -} while (0) - -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) - -#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) - -#define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ - NULL: pte_offset_kernel(pmd, address)) +#endif +#endif /* CONFIG_MMU */ extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); -extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); @@ -884,7 +834,6 @@ extern int split_vma(struct mm_struct *, extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); -extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); @@ -945,8 +894,7 @@ void handle_ra_miss(struct 
address_space *mapping, unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +extern int expand_stack(struct vm_area_struct * vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); @@ -969,28 +917,40 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); -struct page *vmalloc_to_page(void *addr); -unsigned long vmalloc_to_pfn(void *addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); -struct page *follow_page(struct mm_struct *, unsigned long address, - unsigned int foll_flags); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +extern struct page * vmalloc_to_page(void *addr); +extern unsigned long vmalloc_to_pfn(void *addr); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, + int write); +extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); +int remap_pfn_range(struct vm_area_struct *, unsigned long, + unsigned long, unsigned long, pgprot_t); #ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else -static inline void vm_stat_account(struct mm_struct *mm, +static inline void __vm_stat_account(struct mm_struct 
*mm, unsigned long flags, struct file *file, long pages) { } #endif /* CONFIG_PROC_FS */ +static inline void vm_stat_account(struct vm_area_struct *vma) +{ + __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, + vma_pages(vma)); +} + +static inline void vm_stat_unaccount(struct vm_area_struct *vma) +{ + __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, + -vma_pages(vma)); +} + +/* update per process rss and vm hiwater data */ +extern void update_mem_hiwater(struct task_struct *tsk); + #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/trunk/include/linux/mmzone.h b/trunk/include/linux/mmzone.h index f5fa3082fd6a..7519eb4191e7 100644 --- a/trunk/include/linux/mmzone.h +++ b/trunk/include/linux/mmzone.h @@ -12,7 +12,6 @@ #include #include #include -#include #include /* Free memory management - zoned buddy allocator. */ @@ -138,10 +137,6 @@ struct zone { * free areas of different sizes */ spinlock_t lock; -#ifdef CONFIG_MEMORY_HOTPLUG - /* see spanned/present_pages for more description */ - seqlock_t span_seqlock; -#endif struct free_area free_area[MAX_ORDER]; @@ -225,16 +220,6 @@ struct zone { /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; - /* - * zone_start_pfn, spanned_pages and present_pages are all - * protected by span_seqlock. It is a seqlock because it has - * to be read outside of zone->lock, and it is done in the main - * allocator path. But, it is written quite infrequently. - * - * The lock is declared along with zone->lock because it is - * frequently read in proximity to zone->lock. It's good to - * give them a chance of being in the same cacheline. 
- */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ @@ -288,16 +273,6 @@ typedef struct pglist_data { struct page *node_mem_map; #endif struct bootmem_data *bdata; -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Must be held any time you expect node_start_pfn, node_present_pages - * or node_spanned_pages stay constant. Holding this will also - * guarantee that any pfn_valid() stays that way. - * - * Nests above zone->lock and zone->size_seqlock. - */ - spinlock_t node_size_lock; -#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -318,8 +293,6 @@ typedef struct pglist_data { #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) -#include - extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, @@ -536,7 +509,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern int __section_nr(struct mem_section* ms); /* * We use the lower bits of the mem_map pointer to store diff --git a/trunk/include/linux/rmap.h b/trunk/include/linux/rmap.h index 35b30e6c8cf8..e80fb7ee6efd 100644 --- a/trunk/include/linux/rmap.h +++ b/trunk/include/linux/rmap.h @@ -95,8 +95,8 @@ int try_to_unmap(struct page *); /* * Called from mm/filemap_xip.c to unmap empty zero page */ -pte_t *page_check_address(struct page *, struct mm_struct *, - unsigned long, spinlock_t **); +pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long); + /* * Used by swapoff to help locate where page is expected in vma. 
diff --git a/trunk/include/linux/rwsem-spinlock.h b/trunk/include/linux/rwsem-spinlock.h index f30f805080ae..b52a2af25f1f 100644 --- a/trunk/include/linux/rwsem-spinlock.h +++ b/trunk/include/linux/rwsem-spinlock.h @@ -61,10 +61,5 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem)); extern void FASTCALL(__up_write(struct rw_semaphore *sem)); extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->activity != 0); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ diff --git a/trunk/include/linux/scatterlist.h b/trunk/include/linux/scatterlist.h index 7f717e95ae37..66ff545552f7 100644 --- a/trunk/include/linux/scatterlist.h +++ b/trunk/include/linux/scatterlist.h @@ -1,14 +1,23 @@ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H -static inline void sg_init_one(struct scatterlist *sg, - u8 *buf, unsigned int buflen) -{ - memset(sg, 0, sizeof(*sg)); +#include +#include +#include +static inline void sg_set_buf(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ sg->page = virt_to_page(buf); sg->offset = offset_in_page(buf); sg->length = buflen; } +static inline void sg_init_one(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ + memset(sg, 0, sizeof(*sg)); + sg_set_buf(sg, buf, buflen); +} + #endif /* _LINUX_SCATTERLIST_H */ diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index 1c30bc308ef1..27519df0f987 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -249,36 +249,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS -/* - * The mm counters are not protected by its page_table_lock, - * so must be incremented atomically. 
- */ -#ifdef ATOMIC64_INIT -#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) -#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) -#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) -#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) -#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) -typedef atomic64_t mm_counter_t; -#else /* !ATOMIC64_INIT */ -/* - * The counters wrap back to 0 at 2^32 * PAGE_SIZE, - * that is, at 16TB if using 4kB page size. - */ -#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) -#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) -#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) -#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) -#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) -typedef atomic_t mm_counter_t; -#endif /* !ATOMIC64_INIT */ - -#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ -/* - * The mm counters are protected by its page_table_lock, - * so can be incremented directly. 
- */ #define set_mm_counter(mm, member, value) (mm)->_##member = (value) #define get_mm_counter(mm, member) ((mm)->_##member) #define add_mm_counter(mm, member, value) (mm)->_##member += (value) @@ -286,20 +256,6 @@ typedef atomic_t mm_counter_t; #define dec_mm_counter(mm, member) (mm)->_##member-- typedef unsigned long mm_counter_t; -#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ - -#define get_mm_rss(mm) \ - (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) -#define update_hiwater_rss(mm) do { \ - unsigned long _rss = get_mm_rss(mm); \ - if ((mm)->hiwater_rss < _rss) \ - (mm)->hiwater_rss = _rss; \ -} while (0) -#define update_hiwater_vm(mm) do { \ - if ((mm)->hiwater_vm < (mm)->total_vm) \ - (mm)->hiwater_vm = (mm)->total_vm; \ -} while (0) - struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -323,20 +279,15 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. 
- */ - mm_counter_t _file_rss; - mm_counter_t _anon_rss; - - unsigned long hiwater_rss; /* High-watermark of RSS usage */ - unsigned long hiwater_vm; /* High-water virtual memory usage */ - - unsigned long total_vm, locked_vm, shared_vm, exec_vm; - unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; + unsigned long total_vm, locked_vm, shared_vm; + unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; + + /* Special counters protected by the page_table_lock */ + mm_counter_t _rss; + mm_counter_t _anon_rss; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ @@ -357,7 +308,11 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; + struct kioctx default_kioctx; + + unsigned long hiwater_rss; /* High-water RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ }; struct sighand_struct { diff --git a/trunk/include/linux/vmalloc.h b/trunk/include/linux/vmalloc.h index 1d5577b2b752..3701a0673d2c 100644 --- a/trunk/include/linux/vmalloc.h +++ b/trunk/include/linux/vmalloc.h @@ -32,14 +32,10 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); -extern void *vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); -extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, - pgprot_t prot, int node); +extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); extern void vfree(void *addr); extern void *vmap(struct page **pages, unsigned int count, @@ -52,8 +48,6 @@ extern void vunmap(void *addr); extern struct vm_struct 
*get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node); extern struct vm_struct *remove_vm_area(void *addr); extern struct vm_struct *__remove_vm_area(void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/trunk/ipc/shm.c b/trunk/ipc/shm.c index b58c651d31ae..dca90489e3b0 100644 --- a/trunk/ipc/shm.c +++ b/trunk/ipc/shm.c @@ -233,11 +233,10 @@ static int newseg (key_t key, int shmflg, size_t size) shp->id = shm_buildid(id,shp->shm_perm.seq); shp->shm_file = file; file->f_dentry->d_inode->i_ino = shp->id; - - /* Hugetlb ops would have already been assigned. */ - if (!(shmflg & SHM_HUGETLB)) + if (shmflg & SHM_HUGETLB) + set_file_hugepages(file); + else file->f_op = &shm_file_operations; - shm_tot += numpages; shm_unlock(shp); return shp->id; diff --git a/trunk/kernel/acct.c b/trunk/kernel/acct.c index 2e3f4a47e7d0..b756f527497e 100644 --- a/trunk/kernel/acct.c +++ b/trunk/kernel/acct.c @@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index 79f52b85d6ed..3b25b182d2be 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -839,10 +839,7 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 
8a069612eac3..280bd44ac441 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) { - struct vm_area_struct *mpnt, *tmp, **pprev; + struct vm_area_struct * mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(oldmm); - down_write(&mm->mmap_sem); - + flush_cache_mm(current->mm); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } @@ -253,8 +253,12 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* - * Link in the new vma and copy the page table entries. + * Link in the new vma and copy the page table entries: + * link in first so that swapoff can see swap entries. + * Note that, exceptionally, here the vma is inserted + * without holding mm->mmap_sem. 
*/ + spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -263,7 +267,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, tmp); + retval = copy_page_range(mm, current->mm, tmp); + spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -272,9 +277,9 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto out; } retval = 0; + out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); + flush_tlb_mm(current->mm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: @@ -318,8 +323,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; @@ -496,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_rss(mm); + mm->hiwater_rss = get_mm_counter(mm,rss); mm->hiwater_vm = mm->total_vm; good_mm: diff --git a/trunk/kernel/futex.c b/trunk/kernel/futex.c index 3b4d5ad44cc6..ca05fe6a70b2 100644 --- a/trunk/kernel/futex.c +++ b/trunk/kernel/futex.c @@ -205,13 +205,15 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. */ - page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); + spin_lock(&current->mm->page_table_lock); + page = follow_page(mm, uaddr, 0); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); + spin_unlock(&current->mm->page_table_lock); return 0; } + spin_unlock(&current->mm->page_table_lock); /* * Do it the general way.
diff --git a/trunk/kernel/kexec.c b/trunk/kernel/kexec.c index 2c95848fbce8..36c5d9cd4cc1 100644 --- a/trunk/kernel/kexec.c +++ b/trunk/kernel/kexec.c @@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) if (pages) { unsigned int count, i; pages->mapping = NULL; - set_page_private(pages, order); + pages->private = order; count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page_private(page); + order = page->private; count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); diff --git a/trunk/kernel/power/swsusp.c b/trunk/kernel/power/swsusp.c index 016504ccfccf..10bc5ec496d7 100644 --- a/trunk/kernel/power/swsusp.c +++ b/trunk/kernel/power/swsusp.c @@ -578,23 +578,15 @@ static int save_highmem_zone(struct zone *zone) continue; page = pfn_to_page(pfn); /* - * PageReserved results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. - * - * rvmalloc should not cause this, because all implementations - * appear to always be using vmalloc_32 on architectures with - * highmem. This is a good thing, because we would like to save - * rvmalloc pages. - * - * It appears to be triggered by pages which do not point to - * valid memory (see arch/i386/mm/init.c:one_highpage_init(), - * which sets PageReserved if the page does not point to valid - * RAM. - * - * XXX: must remove usage of PageReserved! + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. 
*/ - if (PageReserved(page)) + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); continue; + } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; @@ -680,9 +672,10 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 0; page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (pfn_is_nosave(pfn)) { + if (PageReserved(page) && pfn_is_nosave(pfn)) { pr_debug("[nosave pfn 0x%lx]", pfn); return 0; } diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 4f26c544d02c..1e5cafdf4e27 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -2511,6 +2511,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); + /* Update rss highwater mark */ + update_mem_hiwater(p); } /* diff --git a/trunk/kernel/timer.c b/trunk/kernel/timer.c index 6a2e5f8dc725..3ba10fa35b60 100644 --- a/trunk/kernel/timer.c +++ b/trunk/kernel/timer.c @@ -752,15 +752,6 @@ static void second_overflow(void) else time_adj += (time_adj >> 2) + (time_adj >> 5); #endif -#if HZ == 250 - /* Compensate for (HZ==250) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); -#endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 
14) diff --git a/trunk/mm/Kconfig b/trunk/mm/Kconfig index 1a4473fcb2ca..391ffc54d136 100644 --- a/trunk/mm/Kconfig +++ b/trunk/mm/Kconfig @@ -111,24 +111,3 @@ config SPARSEMEM_STATIC config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC - -# eventually, we can have this option just 'select SPARSEMEM' -config MEMORY_HOTPLUG - bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND - -comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND - -# Heavily threaded applications may benefit from splitting the mm-wide -# page_table_lock, so that faults on different parts of the user address -# space can be handled with less contention: split it at this NR_CPUS. -# Default to 4 for wider testing, though 8 might be more appropriate. -# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. -# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. 
-# -config SPLIT_PTLOCK_CPUS - int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT - default "4" diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 2fa6d2ca9f28..4cd69e3ce421 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/trunk/mm/bootmem.c b/trunk/mm/bootmem.c index e8c567177dcf..a58699b6579e 100644 --- a/trunk/mm/bootmem.c +++ b/trunk/mm/bootmem.c @@ -305,7 +305,6 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) if (j + 16 < BITS_PER_LONG) prefetchw(page + j + 16); __ClearPageReserved(page + j); - set_page_count(page + j, 0); } __free_pages(page, order); i += BITS_PER_LONG; diff --git a/trunk/mm/filemap.c b/trunk/mm/filemap.c index 768687f1d46b..1c31b2fd2ca5 100644 --- a/trunk/mm/filemap.c +++ b/trunk/mm/filemap.c @@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * * ->mmap_sem * ->i_mmap_lock - * ->page_table_lock or pte_lock (various, mainly in memory.c) + * ->page_table_lock (various places, mainly in mmap.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem @@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock - * ->page_table_lock or pte_lock (anon_vma_prepare and various) + * ->page_table_lock (anon_vma_prepare and various) * - * ->page_table_lock or pte_lock + * ->page_table_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) @@ -152,7 +152,7 @@ static int sync_page(void *word) * in the ->sync_page() methods make essential use of the * page_mapping(), merely passing the page down to the backing * 
device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is + * ignore it for all cases but swap, where only page->private is * of interest. When page_mapping() does go NULL, the entire * call stack gracefully ignores the page and returns. * -- wli @@ -1520,7 +1520,7 @@ int filemap_populate(struct vm_area_struct *vma, unsigned long addr, page_cache_release(page); return err; } - } else if (vma->vm_flags & VM_NONLINEAR) { + } else { /* No page was found just because we can't read it in now (being * here implies nonblock != 0), but the page may exist, so set * the PTE to fault it in later. */ @@ -1537,7 +1537,6 @@ int filemap_populate(struct vm_area_struct *vma, unsigned long addr, return 0; } -EXPORT_SYMBOL(filemap_populate); struct vm_operations_struct generic_file_vm_ops = { .nopage = filemap_nopage, @@ -1556,6 +1555,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &generic_file_vm_ops; return 0; } +EXPORT_SYMBOL(filemap_populate); /* * This is for filesystems which do not implement ->writepage. diff --git a/trunk/mm/filemap_xip.c b/trunk/mm/filemap_xip.c index 9cf687e4a29a..8c199f537732 100644 --- a/trunk/mm/filemap_xip.c +++ b/trunk/mm/filemap_xip.c @@ -174,8 +174,6 @@ __xip_unmap (struct address_space * mapping, unsigned long address; pte_t *pte; pte_t pteval; - spinlock_t *ptl; - struct page *page; spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { @@ -183,17 +181,19 @@ __xip_unmap (struct address_space * mapping, address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - page = ZERO_PAGE(address); - pte = page_check_address(page, mm, address, &ptl); - if (pte) { + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... 
+ */ + pte = page_check_address(ZERO_PAGE(address), mm, + address); + if (!IS_ERR(pte)) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - page_cache_release(page); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); } } spin_unlock(&mapping->i_mmap_lock); @@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area, page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); if (!IS_ERR(page)) { - goto out; + return page; } if (PTR_ERR(page) != -ENODATA) return NULL; @@ -249,8 +249,6 @@ xip_file_nopage(struct vm_area_struct * area, page = ZERO_PAGE(address); } -out: - page_cache_get(page); return page; } diff --git a/trunk/mm/fremap.c b/trunk/mm/fremap.c index d862be3bc3e3..ab23a0673c35 100644 --- a/trunk/mm/fremap.c +++ b/trunk/mm/fremap.c @@ -20,32 +20,33 @@ #include #include -static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; - struct page *page = NULL; + if (pte_none(pte)) + return; if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); + flush_cache_page(vma, addr, pfn); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); + dec_mm_counter(mm, rss); + } } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: - return !!page; } /* @@ -63,20 +64,21 @@ int 
install_page(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; - spinlock_t *ptl; - - BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); + spin_lock(&mm->page_table_lock); + pud = pud_alloc(mm, pgd, addr); if (!pud) - goto out; + goto err_unlock; + pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + goto err_unlock; + + pte = pte_alloc_map(mm, pmd, addr); if (!pte) - goto out; + goto err_unlock; /* * This page may have been truncated. Tell the @@ -86,27 +88,29 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, inode = vma->vm_file->f_mapping->host; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (!page->mapping || page->index >= size) - goto unlock; + goto err_unlock; err = -ENOMEM; if (page_mapcount(page) > INT_MAX/2) - goto unlock; + goto err_unlock; - if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) - inc_mm_counter(mm, file_rss); + zap_pte(mm, vma, addr, pte); + inc_mm_counter(mm,rss); flush_icache_page(vma, page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); pte_val = *pte; + pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); + err = 0; -unlock: - pte_unmap_unlock(pte, ptl); -out: +err_unlock: + spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(install_page); + /* * Install a file pte to a given virtual memory address, release any * previously existing mapping. 
@@ -120,35 +124,37 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; - spinlock_t *ptl; - - BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); + spin_lock(&mm->page_table_lock); + pud = pud_alloc(mm, pgd, addr); if (!pud) - goto out; + goto err_unlock; + pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + goto err_unlock; + + pte = pte_alloc_map(mm, pmd, addr); if (!pte) - goto out; + goto err_unlock; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { - update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); - } + zap_pte(mm, vma, addr, pte); set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; + pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - pte_unmap_unlock(pte, ptl); - err = 0; -out: + spin_unlock(&mm->page_table_lock); + return 0; + +err_unlock: + spin_unlock(&mm->page_table_lock); return err; } + /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing store * file within an existing vma. 
diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c index c9b43360fd33..61d380678030 100644 --- a/trunk/mm/hugetlb.c +++ b/trunk/mm/hugetlb.c @@ -277,23 +277,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long addr; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - src_pte = huge_pte_offset(src, addr); - if (!src_pte) - continue; dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; - spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - if (!pte_none(*src_pte)) { + src_pte = huge_pte_offset(src, addr); + if (src_pte && !pte_none(*src_pte)) { entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); - add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); } return 0; @@ -314,14 +310,12 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); - spin_lock(&mm->page_table_lock); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); - if (!ptep) + if (! 
ptep) + /* This can happen on truncate, or if an + * mmap() is aborted due to an error before + * the prefault */ continue; pte = huge_ptep_get_and_clear(mm, address, ptep); @@ -330,99 +324,96 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, page = pte_page(pte); put_page(page); - add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); + add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); } + flush_tlb_range(vma, start, end); +} + +void zap_hugepage_range(struct vm_area_struct *vma, + unsigned long start, unsigned long length) +{ + struct mm_struct *mm = vma->vm_mm; + spin_lock(&mm->page_table_lock); + unmap_hugepage_range(vma, start, start + length); spin_unlock(&mm->page_table_lock); - flush_tlb_range(vma, start, end); } -static struct page *find_lock_huge_page(struct address_space *mapping, - unsigned long idx) +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) { - struct page *page; - int err; - struct inode *inode = mapping->host; - unsigned long size; - -retry: - page = find_lock_page(mapping, idx); - if (page) - goto out; - - /* Check to make sure the mapping hasn't been truncated */ - size = i_size_read(inode) >> HPAGE_SHIFT; - if (idx >= size) - goto out; - - if (hugetlb_get_quota(mapping)) - goto out; - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - goto out; - } + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + hugetlb_prefault_arch_hook(mm); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = 
-ENOMEM; + goto out; + } + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); } out: - return page; + spin_unlock(&mm->page_table_lock); + return ret; } +/* + * On ia64 at least, it is possible to receive a hugetlb fault from a + * stale zero entry left in the TLB from earlier hardware prefetching. + * Low-level arch code should already have flushed the stale entry as + * part of its fault handling, but we do need to accept this minor fault + * and return successfully. Whereas the "normal" case is that this is + * an access to a hugetlb page which has been truncated off since mmap. + */ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { int ret = VM_FAULT_SIGBUS; - unsigned long idx; - unsigned long size; pte_t *pte; - struct page *page; - struct address_space *mapping; - - pte = huge_pte_alloc(mm, address); - if (!pte) - goto out; - - mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - - /* - * Use page lock to guard against racing truncation - * before we get page_table_lock. 
- */ - page = find_lock_huge_page(mapping, idx); - if (!page) - goto out; spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; - if (idx >= size) - goto backout; - - ret = VM_FAULT_MINOR; - if (!pte_none(*pte)) - goto backout; - - add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); + pte = huge_pte_offset(mm, address); + if (pte && !pte_none(*pte)) + ret = VM_FAULT_MINOR; spin_unlock(&mm->page_table_lock); - unlock_page(page); -out: return ret; - -backout: - spin_unlock(&mm->page_table_lock); - hugetlb_put_quota(mapping); - unlock_page(page); - put_page(page); - goto out; } int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -432,36 +423,34 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vpfn, vaddr = *position; int remainder = *length; + BUG_ON(!is_vm_hugetlb_page(vma)); + vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { - pte_t *pte; - struct page *page; - /* - * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the - * first, for the page indexing below to work. - */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - - if (!pte || pte_none(*pte)) { - int ret; + if (pages) { + pte_t *pte; + struct page *page; + + /* Some archs (sparc64, sh*) have multiple + * pte_ts to each hugepage. We have to make + * sure we get the first, for the page + * indexing below to work. 
*/ + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + + /* the hugetlb file might have been truncated */ + if (!pte || pte_none(*pte)) { + remainder = 0; + if (!i) + i = -EFAULT; + break; + } - spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, 0); - spin_lock(&mm->page_table_lock); - if (ret == VM_FAULT_MINOR) - continue; + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - remainder = 0; - if (!i) - i = -EFAULT; - break; - } + WARN_ON(!PageCompound(page)); - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; get_page(page); pages[i] = page; } diff --git a/trunk/mm/madvise.c b/trunk/mm/madvise.c index 17aaf3e16449..20e075d1c64c 100644 --- a/trunk/mm/madvise.c +++ b/trunk/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) + if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/trunk/mm/memory.c b/trunk/mm/memory.c index 0f60baf6f69b..1db40e935e55 100644 --- a/trunk/mm/memory.c +++ b/trunk/mm/memory.c @@ -114,7 +114,6 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = pmd_page(*pmd); pmd_clear(pmd); - pte_lock_deinit(page); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); tlb->mm->nr_ptes--; @@ -250,7 +249,7 @@ void free_pgd_range(struct mmu_gather **tlb, free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); - if (!(*tlb)->fullmm) + if (!tlb_is_full_mm(*tlb)) flush_tlb_pgtables((*tlb)->mm, start, end); } @@ -261,12 +260,6 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; - /* - * Hide vma from rmap and vmtruncate before freeing pgtables - */ - anon_vma_unlink(vma); - unlink_file_vma(vma); - if 
(is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -279,8 +272,6 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, HPAGE_SIZE)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); - unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -289,78 +280,72 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { - struct page *new = pte_alloc_one(mm, address); - if (!new) - return -ENOMEM; - - pte_lock_init(new); - spin_lock(&mm->page_table_lock); - if (pmd_present(*pmd)) { /* Another has populated it */ - pte_lock_deinit(new); - pte_free(new); - } else { + if (!pmd_present(*pmd)) { + struct page *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pmd_present(*pmd)) { + pte_free(new); + goto out; + } mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } - spin_unlock(&mm->page_table_lock); - return 0; +out: + return pte_offset_map(pmd, address); } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); - if (!new) - return -ENOMEM; + if (!pmd_present(*pmd)) { + pte_t *new; - spin_lock(&init_mm.page_table_lock); - if (pmd_present(*pmd)) /* Another has populated it */ - pte_free_kernel(new); - else - pmd_populate_kernel(&init_mm, pmd, new); - spin_unlock(&init_mm.page_table_lock); - return 0; -} - -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) -{ - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); -} + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; -/* - * This function is called to print an error when a pte in a - * !VM_RESERVED region is found pointing to an invalid pfn (which - * is an error. - * - * The calling function must still handle the error. - */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) -{ - printk(KERN_ERR "Bad pte = %08llx, process = %s, " - "vm_flags = %lx, vaddr = %lx\n", - (long long)pte_val(pte), - (vma->vm_mm == current->mm ? current->comm : "???"), - vma->vm_flags, vaddr); - dump_stack(); + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free_kernel(new); + goto out; + } + pmd_populate_kernel(mm, pmd, new); + } +out: + return pte_offset_kernel(pmd, address); } /* * copy one vm_area from one task to the other. 
Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + unsigned long addr) { - unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; unsigned long pfn; @@ -372,31 +357,28 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); + list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } } - goto out_set_pte; + set_pte_at(dst_mm, addr, dst_pte, pte); + return; } - /* If the region is VM_RESERVED, the mapping is not - * mapped via rmap - duplicate the pte as is. - */ - if (vm_flags & VM_RESERVED) - goto out_set_pte; - pfn = pte_pfn(pte); - /* If the pte points outside of valid memory but - * the region is not VM_RESERVED, we have a problem. + /* the pte points outside of valid memory, the + * mapping is assumed to be good, meaningful + * and not mapped via rmap - duplicate the + * mapping as is. 
*/ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } + page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); - page = pfn_to_page(pfn); + if (!page || PageReserved(page)) { + set_pte_at(dst_mm, addr, dst_pte, pte); + return; + } /* * If it's a COW mapping, write protect it both @@ -415,11 +397,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; - -out_set_pte: + inc_mm_counter(dst_mm, rss); + if (PageAnon(page)) + inc_mm_counter(dst_mm, anon_rss); set_pte_at(dst_mm, addr, dst_pte, pte); + page_dup_rmap(page); } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -427,44 +409,38 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - spinlock_t *src_ptl, *dst_ptl; - int progress = 0; - int rss[2]; + unsigned long vm_flags = vma->vm_flags; + int progress; again: - rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock(src_ptl); + progress = 0; + spin_lock(&src_mm->page_table_lock); do { /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. 
*/ - if (progress >= 32) { - progress = 0; - if (need_resched() || - need_lockbreak(src_ptl) || - need_lockbreak(dst_ptl)) - break; - } + if (progress >= 32 && (need_resched() || + need_lockbreak(&src_mm->page_table_lock) || + need_lockbreak(&dst_mm->page_table_lock))) + break; if (pte_none(*src_pte)) { progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + spin_unlock(&src_mm->page_table_lock); - spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); - add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); - cond_resched(); + pte_unmap(dst_pte - 1); + cond_resched_lock(&dst_mm->page_table_lock); if (addr != end) goto again; return 0; @@ -549,30 +525,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static void zap_pte_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pmd_t *pmd, +static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { - struct mm_struct *mm = tlb->mm; pte_t *pte; - spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = pte_offset_map(pmd, addr); do { pte_t ptent = *pte; if (pte_none(ptent)) continue; if (pte_present(ptent)) { struct page *page = NULL; - if (!(vma->vm_flags & VM_RESERVED)) { - unsigned long pfn = pte_pfn(ptent); - if (unlikely(!pfn_valid(pfn))) - print_bad_pte(vma, ptent, addr); - else - page = pfn_to_page(pfn); + unsigned long pfn = pte_pfn(ptent); + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; } if (unlikely(details) && page) { /* @@ -592,7 +562,7 @@ static void zap_pte_range(struct mmu_gather *tlb, page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, + ptent 
= ptep_get_and_clear_full(tlb->mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) @@ -600,17 +570,15 @@ static void zap_pte_range(struct mmu_gather *tlb, if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(mm, addr, pte, + set_pte_at(tlb->mm, addr, pte, pgoff_to_pte(page->index)); + if (pte_dirty(ptent)) + set_page_dirty(page); if (PageAnon(page)) - anon_rss--; - else { - if (pte_dirty(ptent)) - set_page_dirty(page); - if (pte_young(ptent)) - mark_page_accessed(page); - file_rss--; - } + dec_mm_counter(tlb->mm, anon_rss); + else if (pte_young(ptent)) + mark_page_accessed(page); + tlb->freed++; page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -623,15 +591,12 @@ static void zap_pte_range(struct mmu_gather *tlb, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); - - add_mm_rss(mm, file_rss, anon_rss); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); } -static inline void zap_pmd_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pud_t *pud, +static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -643,12 +608,11 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - zap_pte_range(tlb, vma, pmd, addr, next, details); + zap_pte_range(tlb, pmd, addr, next, details); } while (pmd++, addr = next, addr != end); } -static inline void zap_pud_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pgd_t *pgd, +static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -660,7 +624,7 @@ static inline void 
zap_pud_range(struct mmu_gather *tlb, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - zap_pmd_range(tlb, vma, pud, addr, next, details); + zap_pmd_range(tlb, pud, addr, next, details); } while (pud++, addr = next, addr != end); } @@ -681,7 +645,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - zap_pud_range(tlb, vma, pgd, addr, next, details); + zap_pud_range(tlb, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -696,6 +660,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather + * @mm: the controlling mm_struct * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -704,10 +669,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * * Returns the end address of the unmapping (restart addr if interrupted). * - * Unmap all pages in the vma list. + * Unmap all pages in the vma list. Called under page_table_lock. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * We aim to not hold page_table_lock for too long (for scheduling latency + * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to * return the ending mmu_gather to the caller. * * Only addresses between `start' and `end' will be unmapped. @@ -719,7 +684,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. 
*/ -unsigned long unmap_vmas(struct mmu_gather **tlbp, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) @@ -729,7 +694,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; + int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -769,15 +734,19 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || + need_lockbreak(&mm->page_table_lock) || (i_mmap_lock && need_lockbreak(i_mmap_lock))) { if (i_mmap_lock) { - *tlbp = NULL; + /* must reset count of rss freed */ + *tlbp = tlb_gather_mmu(mm, fullmm); goto out; } + spin_unlock(&mm->page_table_lock); cond_resched(); + spin_lock(&mm->page_table_lock); } - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + *tlbp = tlb_gather_mmu(mm, fullmm); tlb_start_valid = 0; zap_bytes = ZAP_BLOCK_SIZE; } @@ -801,93 +770,123 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long end = address + size; unsigned long nr_accounted = 0; + if (is_vm_hugetlb_page(vma)) { + zap_hugepage_range(vma, address, size); + return end; + } + lru_add_drain(); + spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); + end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); + tlb_finish_mmu(tlb, address, end); + spin_unlock(&mm->page_table_lock); return end; } /* * Do a quick page-table lookup for a single page. + * mm->page_table_lock must be held. 
*/ -struct page *follow_page(struct mm_struct *mm, unsigned long address, - unsigned int flags) +static struct page *__follow_page(struct mm_struct *mm, unsigned long address, + int read, int write, int accessed) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; - spinlock_t *ptl; unsigned long pfn; struct page *page; - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } + page = follow_huge_addr(mm, address, write); + if (! IS_ERR(page)) + return page; - page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; + goto out; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto no_page_table; + goto out; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto no_page_table; - - if (pmd_huge(*pmd)) { - BUG_ON(flags & FOLL_GET); - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; - } + if (pmd_huge(*pmd)) + return follow_huge_pmd(mm, address, pmd, write); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + ptep = pte_offset_map(pmd, address); if (!ptep) goto out; pte = *ptep; - if (!pte_present(pte)) - goto unlock; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) - goto unlock; - - page = pfn_to_page(pfn); - if (flags & FOLL_GET) - get_page(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); + pte_unmap(ptep); + if (pte_present(pte)) { + if (write && !pte_write(pte)) + goto out; + if (read && !pte_read(pte)) + goto out; + pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (accessed) { + if (write && !pte_dirty(pte) &&!PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } + return page; + } } -unlock: - pte_unmap_unlock(ptep, ptl); + out: - return 
page; + return NULL; +} -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. - */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(address); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } - return page; +inline struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + return __follow_page(mm, address, 0, write, 1); +} + +/* + * check_user_page_readable() can be called frm niterrupt context by oprofile, + * so we need to avoid taking any non-irq-safe locks + */ +int check_user_page_readable(struct mm_struct *mm, unsigned long address) +{ + return __follow_page(mm, address, 1, 0, 0) != NULL; +} +EXPORT_SYMBOL(check_user_page_readable); + +static inline int +untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + /* Check if the vma is for an anonymous mapping. */ + if (vma->vm_ops && vma->vm_ops->nopage) + return 0; + + /* Check if page directory entry exists. */ + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return 1; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + return 1; + + /* Check if page middle directory entry exists. */ + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 1; + + /* There is a pte slot for 'address' in 'mm'. */ + return 0; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -895,19 +894,18 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int flags; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? 
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { - struct vm_area_struct *vma; - unsigned int foll_flags; + struct vm_area_struct * vma; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -947,8 +945,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) - || !(vm_flags & vma->vm_flags)) + if (!vma || (vma->vm_flags & VM_IO) + || !(flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -956,25 +954,29 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, &start, &len, i); continue; } - - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->nopage)) - foll_flags |= FOLL_ANON; - + spin_lock(&mm->page_table_lock); do { + int write_access = write; struct page *page; - if (write) - foll_flags |= FOLL_WRITE; - - cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + cond_resched_lock(&mm->page_table_lock); + while (!(page = follow_page(mm, start, write_access))) { int ret; - ret = __handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + + /* + * Shortcut for anonymous pages. We don't want + * to force the creation of pages tables for + * insanely big anonymously mapped areas that + * nobody touched so far. This is important + * for doing a core dump for these mappings. 
+ */ + if (!write && untouched_anonymous_page(mm,vma,start)) { + page = ZERO_PAGE(start); + break; + } + spin_unlock(&mm->page_table_lock); + ret = __handle_mm_fault(mm, vma, start, write_access); + /* * The VM_FAULT_WRITE bit tells us that do_wp_page has * broken COW when necessary, even if maybe_mkwrite @@ -982,7 +984,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * subsequent page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) - foll_flags &= ~FOLL_WRITE; + write_access = 0; switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: @@ -998,10 +1000,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } + spin_lock(&mm->page_table_lock); } if (pages) { pages[i] = page; flush_dcache_page(page); + if (!PageReserved(page)) + page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1009,6 +1014,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); + spin_unlock(&mm->page_table_lock); } while (len); return i; } @@ -1018,21 +1024,16 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot) { pte_t *pte; - spinlock_t *ptl; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = pte_alloc_map(mm, pmd, addr); if (!pte) return -ENOMEM; do { - struct page *page = ZERO_PAGE(addr); - pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); - page_cache_get(page); - page_add_file_rmap(page); - inc_mm_counter(mm, file_rss); + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); return 0; } @@ -1082,12 +1083,14 @@ int zeromap_page_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); do { next = 
pgd_addr_end(addr, end); err = zeromap_pud_range(mm, pgd, addr, next, prot); if (err) break; } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); return err; } @@ -1101,17 +1104,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long pfn, pgprot_t prot) { pte_t *pte; - spinlock_t *ptl; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = pte_alloc_map(mm, pmd, addr); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); return 0; } @@ -1170,8 +1173,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out). + * VM_RESERVED tells swapout not to try to touch + * this region. */ vma->vm_flags |= VM_IO | VM_RESERVED; @@ -1179,6 +1182,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, @@ -1186,35 +1190,11 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(remap_pfn_range); -/* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. 
i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). - */ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) -{ - int same = 1; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) - if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - same = pte_same(*page_table, orig_pte); - spin_unlock(ptl); - } -#endif - pte_unmap(page_table); - return same; -} - /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want @@ -1228,11 +1208,29 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + pte_t entry; + + entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), + vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). 
* Thus we can safely just mark it writable once we've done any necessary @@ -1242,28 +1240,28 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); + unsigned long pfn = pte_pfn(pte); pte_t entry; - int ret = VM_FAULT_MINOR; - - BUG_ON(vma->vm_flags & VM_RESERVED); + int ret; if (unlikely(!pfn_valid(pfn))) { /* - * Page table corrupted: show pte and kill process. + * This should really halt the system so it can be debugged or + * at least the kernel stops what it's doing before it corrupts + * data, but for the moment just pretend this is OOM. 
*/ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; + pte_unmap(page_table); + printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", + address); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_OOM; } old_page = pfn_to_page(pfn); @@ -1272,51 +1270,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); if (reuse) { flush_cache_page(vma, address, pfn); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), + vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - ret |= VM_FAULT_WRITE; - goto unlock; + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_MINOR|VM_FAULT_WRITE; } } + pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. */ - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); + if (!PageReserved(old_page)) + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto oom; + goto no_new_page; if (old_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) - goto oom; + goto no_new_page; } else { new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) - goto oom; + goto no_new_page; copy_user_highpage(new_page, old_page, address); } - /* * Re-check the pte - we dropped the lock */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { - page_remove_rmap(old_page); - if (!PageAnon(old_page)) { - inc_mm_counter(mm, anon_rss); - dec_mm_counter(mm, file_rss); - } + ret = VM_FAULT_MINOR; + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (likely(pte_same(*page_table, pte))) { + if (PageAnon(old_page)) + dec_mm_counter(mm, anon_rss); + if (PageReserved(old_page)) + inc_mm_counter(mm, 
rss); + else + page_remove_rmap(old_page); flush_cache_page(vma, address, pfn); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + break_cow(vma, new_page, address, page_table); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1324,12 +1323,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = old_page; ret |= VM_FAULT_WRITE; } + pte_unmap(page_table); page_cache_release(new_page); page_cache_release(old_page); -unlock: - pte_unmap_unlock(page_table, ptl); + spin_unlock(&mm->page_table_lock); return ret; -oom: + +no_new_page: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1399,6 +1399,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); + + /* + * We cannot rely on the break test in unmap_vmas: + * on the one hand, we don't want to restart our loop + * just because that broke out for the page_table_lock; + * on the other hand, it does no test when vma is small. + */ need_break = need_resched() || need_lockbreak(details->i_mmap_lock); @@ -1647,37 +1654,38 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc } /* - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. 
*/ -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { - spinlock_t *ptl; struct page *page; - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; - - entry = pte_to_swp_entry(orig_pte); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); page = read_swap_cache_async(entry, vma, address); if (!page) { /* - * Back out if somebody else faulted in this pte - * while we released the pte lock. + * Back out if somebody else faulted in this pte while + * we released the page table lock. */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; - goto unlock; + else + ret = VM_FAULT_MINOR; + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + goto out; } /* Had to read the page from swap area: Major fault */ @@ -1690,11 +1698,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); /* - * Back out if somebody else already faulted in this pte. + * Back out if somebody else faulted in this pte while we + * released the page table lock. 
*/ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*page_table, orig_pte))) + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (unlikely(!pte_same(*page_table, orig_pte))) { + ret = VM_FAULT_MINOR; goto out_nomap; + } if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; @@ -1703,7 +1715,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* The page isn't present yet, go ahead with the fault. */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1721,7 +1733,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access) { if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) + page_table, pmd, pte) == VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } @@ -1729,76 +1741,74 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); lazy_mmu_prot_update(pte); -unlock: - pte_unmap_unlock(page_table, ptl); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); out: return ret; out_nomap: - pte_unmap_unlock(page_table, ptl); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); - return ret; + goto out; } /* - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. 
*/ -static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) +static int +do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + pte_t *page_table, pmd_t *pmd, int write_access, + unsigned long addr) { - struct page *page; - spinlock_t *ptl; pte_t entry; + struct page * page = ZERO_PAGE(addr); + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + /* ..except if it's a write access */ if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage(vma, address); + goto no_mem; + page = alloc_zeroed_user_highpage(vma, addr); if (!page) - goto oom; + goto no_mem; - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, anon_rss); + if (!pte_none(*page_table)) { + pte_unmap(page_table); + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + goto out; + } + inc_mm_counter(mm, rss); + entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, + vma->vm_page_prot)), + vma); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, address); - } else { - /* Map the ZERO_PAGE - vm_page_prot is readonly */ - page = ZERO_PAGE(address); - page_cache_get(page); - entry = mk_pte(page, vma->vm_page_prot); - - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, file_rss); - page_add_file_rmap(page); + page_add_anon_rmap(page, vma, addr); } - set_pte_at(mm, address, page_table, entry); + set_pte_at(mm, addr, page_table, entry); + pte_unmap(page_table); /* 
No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, addr, entry); lazy_mmu_prot_update(entry); -unlock: - pte_unmap_unlock(page_table, ptl); + spin_unlock(&mm->page_table_lock); +out: return VM_FAULT_MINOR; -release: - page_cache_release(page); - goto unlock; -oom: +no_mem: return VM_FAULT_OOM; } @@ -1811,23 +1821,25 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. */ -static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) +static int +do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { - spinlock_t *ptl; - struct page *new_page; + struct page * new_page; struct address_space *mapping = NULL; pte_t entry; unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, + pmd, write_access, address); pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -1835,6 +1847,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, smp_rmb(); /* serializes i_size against truncate_count */ } retry: + cond_resched(); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* * No smp_rmb is needed here as long as there's a full @@ -1867,20 +1880,19 @@ static int do_no_page(struct 
mm_struct *mm, struct vm_area_struct *vma, anon = 1; } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(&mm->page_table_lock); /* * For a file-backed vma, someone could have truncated or otherwise * invalidated this page. If unmap_mapping_range got called, * retry getting the page. */ if (mapping && unlikely(sequence != mapping->truncate_count)) { - pte_unmap_unlock(page_table, ptl); - page_cache_release(new_page); - cond_resched(); sequence = mapping->truncate_count; - smp_rmb(); + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); goto retry; } + page_table = pte_offset_map(pmd, address); /* * This silly early PAGE_DIRTY setting removes a race @@ -1894,67 +1906,68 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { + if (!PageReserved(new_page)) + inc_mm_counter(mm, rss); + flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!(vma->vm_flags & VM_RESERVED)) { - inc_mm_counter(mm, file_rss); + } else page_add_file_rmap(new_page); - } + pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ + pte_unmap(page_table); page_cache_release(new_page); - goto unlock; + spin_unlock(&mm->page_table_lock); + goto out; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); -unlock: - pte_unmap_unlock(page_table, ptl); + spin_unlock(&mm->page_table_lock); +out: return ret; oom: page_cache_release(new_page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out; } /* * Fault of a previously existing named mapping. 
Repopulate the pte * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) +static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) { - pgoff_t pgoff; + unsigned long pgoff; int err; - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return VM_FAULT_MINOR; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, orig_pte, address); - return VM_FAULT_OOM; + BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); + /* + * Fall back to the linear mapping if the fs does not support + * ->populate: + */ + if (!vma->vm_ops->populate || + (write_access && !(vma->vm_flags & VM_SHARED))) { + pte_clear(mm, address, pte); + return do_no_page(mm, vma, address, write_access, pte, pmd); } - /* We can then assume vm->vm_ops && vma->vm_ops->populate */ - pgoff = pte_to_pgoff(orig_pte); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, - vma->vm_page_prot, pgoff, 0); + pgoff = pte_to_pgoff(*pte); + + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -1971,68 +1984,56 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). 
* - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. */ static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t *pte, pmd_t *pmd) { pte_t entry; - pte_t old_entry; - spinlock_t *ptl; - old_entry = entry = *pte; + entry = *pte; if (!pte_present(entry)) { - if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); - } + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. 
+ */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte, pmd); if (pte_file(entry)) - return do_file_page(mm, vma, address, - pte, pmd, write_access, entry); - return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + return do_file_page(mm, vma, address, write_access, pte, pmd); + return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); } - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*pte, entry))) - goto unlock; if (write_access) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, - pte, pmd, ptl, entry); + return do_wp_page(mm, vma, address, pte, pmd, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (!pte_same(old_entry, entry)) { - ptep_set_access_flags(vma, address, pte, entry, write_access); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - } else { - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. 
- */ - if (write_access) - flush_tlb_page(vma, address); - } -unlock: - pte_unmap_unlock(pte, ptl); + ptep_set_access_flags(vma, address, pte, entry, write_access); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2047,66 +2048,100 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pud = pud_alloc(mm, pgd, address); if (!pud) - return VM_FAULT_OOM; + goto oom; + pmd = pmd_alloc(mm, pud, address); if (!pmd) - return VM_FAULT_OOM; + goto oom; + pte = pte_alloc_map(mm, pmd, address); if (!pte) - return VM_FAULT_OOM; + goto oom; + + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + oom: + spin_unlock(&mm->page_table_lock); + return VM_FAULT_OOM; } #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. - * We've already handled the fast-path in-line. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. 
*/ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - pud_t *new = pud_alloc_one(mm, address); - if (!new) - return -ENOMEM; + pud_t *new; + spin_unlock(&mm->page_table_lock); + new = pud_alloc_one(mm, address); spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { pud_free(new); - else - pgd_populate(mm, pgd, new); - spin_unlock(&mm->page_table_lock); - return 0; + goto out; + } + pgd_populate(mm, pgd, new); + out: + return pud_offset(pgd, address); } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. - * We've already handled the fast-path in-line. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. */ -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - pmd_t *new = pmd_alloc_one(mm, address); - if (!new) - return -ENOMEM; + pmd_t *new; + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ + if (pud_present(*pud)) { pmd_free(new); - else - pud_populate(mm, pud, new); + goto out; + } + pud_populate(mm, pud, new); #else - if (pgd_present(*pud)) /* Another has populated it */ + if (pgd_present(*pud)) { pmd_free(new); - else - pgd_populate(mm, pud, new); + goto out; + } + pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); - return 0; + + out: + return pmd_offset(pud, address); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -2171,6 +2206,22 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_pfn); +/* + * update_mem_hiwater + * - update per process rss and vm high water data + */ +void update_mem_hiwater(struct task_struct *tsk) +{ + if (tsk->mm) { + unsigned long rss = get_mm_counter(tsk->mm, rss); + + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) @@ -2182,7 +2233,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = VM_RESERVED; + gate_vma.vm_flags = 0; return 0; } __initcall(gate_vma_init); diff --git a/trunk/mm/memory_hotplug.c b/trunk/mm/memory_hotplug.c deleted file mode 100644 index 431a64f021c0..000000000000 --- a/trunk/mm/memory_hotplug.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * linux/mm/memory_hotplug.c - * - * Copyright (C) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, - unsigned long size); -static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) 
-{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - - zone_type = zone - pgdat->node_zones; - memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); - zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); -} - -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages); -static int __add_section(struct zone *zone, unsigned long phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int ret; - - ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); - - if (ret < 0) - return ret; - - __add_zone(zone, phys_start_pfn); - return register_new_memory(__pfn_to_section(phys_start_pfn)); -} - -/* - * Reasonably generic function for adding memory. It is - * expected that archs that support memory hotplug will - * call this function after deciding the zone to which to - * add the new pages. - */ -int __add_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) -{ - unsigned long i; - int err = 0; - - for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { - err = __add_section(zone, phys_start_pfn + i); - - if (err) - break; - } - - return err; -} - -static void grow_zone_span(struct zone *zone, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - if (end_pfn > old_zone_end_pfn) - zone->spanned_pages = end_pfn - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - -static void grow_pgdat_span(struct pglist_data *pgdat, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; - - if (start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - if (end_pfn > 
old_pgdat_end_pfn) - pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; -} - -int online_pages(unsigned long pfn, unsigned long nr_pages) -{ - unsigned long i; - unsigned long flags; - unsigned long onlined_pages = 0; - struct zone *zone; - - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_sem. - */ - zone = page_zone(pfn_to_page(pfn)); - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, pfn, pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - - for (i = 0; i < nr_pages; i++) { - struct page *page = pfn_to_page(pfn + i); - online_page(page); - onlined_pages++; - } - zone->present_pages += onlined_pages; - - setup_per_zone_pages_min(); - - return 0; -} diff --git a/trunk/mm/mempolicy.c b/trunk/mm/mempolicy.c index 2076b1542b8a..1d5c64df1653 100644 --- a/trunk/mm/mempolicy.c +++ b/trunk/mm/mempolicy.c @@ -2,7 +2,6 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. - * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -18,19 +17,13 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. - * * bind Only allocate memory on a specific set of nodes, * no fallback. - * FIXME: memory is allocated starting with the first node - * to the last. It would be better if bind would truly restrict - * the allocation to memory nodes instead - * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. 
- * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -100,10 +93,23 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; +/* Check if all specified nodes are online */ +static int nodes_online(unsigned long *nodes) +{ + DECLARE_BITMAP(online2, MAX_NUMNODES); + + bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); + if (bitmap_empty(online2, MAX_NUMNODES)) + set_bit(0, online2); + if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) + return -EINVAL; + return 0; +} + /* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, nodemask_t *nodes) +static int mpol_check_policy(int mode, unsigned long *nodes) { - int empty = nodes_empty(*nodes); + int empty = bitmap_empty(nodes, MAX_NUMNODES); switch (mode) { case MPOL_DEFAULT: @@ -118,20 +124,71 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) return -EINVAL; break; } - return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; + return nodes_online(nodes); +} + +/* Copy a node mask from user space. */ +static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, + unsigned long maxnode, int mode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + bitmap_zero(nodes, MAX_NUMNODES); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. 
*/ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes[nlongs-1] &= endmask; + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes); + return mpol_check_policy(mode, nodes); } + /* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(nodemask_t *nodes) +static struct zonelist *bind_zonelist(unsigned long *nodes) { struct zonelist *zl; int num, max, nd; - max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); + max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) { + for (nd = find_first_bit(nodes, MAX_NUMNODES); + nd < MAX_NUMNODES; + nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { int k; for (k = MAX_NR_ZONES-1; k >= 0; k--) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -142,16 +199,17 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) policy_zone = k; } } + BUG_ON(num >= max); zl->zones[num] = NULL; return zl; } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) +static struct mempolicy *mpol_new(int mode, unsigned long *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -160,10 +218,10 @@ static struct 
mempolicy *mpol_new(int mode, nodemask_t *nodes) atomic_set(&policy->refcnt, 1); switch (mode) { case MPOL_INTERLEAVE: - policy->v.nodes = *nodes; + bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); break; case MPOL_PREFERRED: - policy->v.preferred_node = first_node(*nodes); + policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); if (policy->v.preferred_node >= MAX_NUMNODES) policy->v.preferred_node = -1; break; @@ -180,14 +238,14 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } /* Ensure all existing pages follow the policy. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, nodemask_t *nodes) +static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, unsigned long *nodes) { pte_t *orig_pte; pte_t *pte; - spinlock_t *ptl; - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + spin_lock(&mm->page_table_lock); + orig_pte = pte = pte_offset_map(pmd, addr); do { unsigned long pfn; unsigned int nid; @@ -195,20 +253,19 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!pte_present(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + if (!pfn_valid(pfn)) continue; - } nid = pfn_to_nid(pfn); - if (!node_isset(nid, *nodes)) + if (!test_bit(nid, nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(orig_pte, ptl); + pte_unmap(orig_pte); + spin_unlock(&mm->page_table_lock); return addr != end; } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, nodemask_t *nodes) +static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, unsigned long *nodes) { pmd_t *pmd; unsigned long next; @@ -218,14 +275,14 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if 
(pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes)) + if (check_pte_range(mm, pmd, addr, next, nodes)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, nodemask_t *nodes) +static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, unsigned long *nodes) { pud_t *pud; unsigned long next; @@ -235,24 +292,24 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes)) + if (check_pmd_range(mm, pud, addr, next, nodes)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, nodemask_t *nodes) +static inline int check_pgd_range(struct mm_struct *mm, + unsigned long addr, unsigned long end, unsigned long *nodes) { pgd_t *pgd; unsigned long next; - pgd = pgd_offset(vma->vm_mm, addr); + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes)) + if (check_pud_range(mm, pgd, addr, next, nodes)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -261,7 +318,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, /* Step 1: check the range */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - nodemask_t *nodes, unsigned long flags) + unsigned long *nodes, unsigned long flags) { int err; struct vm_area_struct *first, *vma, *prev; @@ -269,8 +326,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_RESERVED) 
- return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) @@ -283,7 +338,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes); + err = check_pgd_range(vma->vm_mm, + start, endvma, nodes); if (err) { first = ERR_PTR(err); break; @@ -337,25 +393,17 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -static int contextualize_policy(int mode, nodemask_t *nodes) -{ - if (!nodes) - return 0; - - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); - return mpol_check_policy(mode, nodes); -} - -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) +/* Change policy for a memory range */ +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; + DECLARE_BITMAP(nodes, MAX_NUMNODES); int err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -370,17 +418,20 @@ long do_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - new = mpol_new(mode, nmask); + + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + + new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); + mode,nodes[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags); + vma = check_range(mm, start, 
end, nodes, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -390,45 +441,50 @@ long do_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -long do_set_mempolicy(int mode, nodemask_t *nodes) +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) { + int err; struct mempolicy *new; + DECLARE_BITMAP(nodes, MAX_NUMNODES); - if (contextualize_policy(mode, nodes)) + if (mode < 0 || mode > MPOL_MAX) return -EINVAL; + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; if (new && new->policy == MPOL_INTERLEAVE) - current->il_next = first_node(new->v.nodes); + current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); return 0; } /* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) +static void get_zonemask(struct mempolicy *p, unsigned long *nodes) { int i; - nodes_clear(*nodes); + bitmap_zero(nodes, MAX_NUMNODES); switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, - *nodes); + __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); break; case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - *nodes = p->v.nodes; + bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); break; case MPOL_PREFERRED: /* or use current node instead of online map? 
*/ if (p->v.preferred_node < 0) - *nodes = node_online_map; + bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); else - node_set(p->v.preferred_node, *nodes); + __set_bit(p->v.preferred_node, nodes); break; default: BUG(); @@ -448,17 +504,37 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) return err; } +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + void *nodes, unsigned nbytes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; +} + /* Retrieve NUMA policy */ -long do_get_mempolicy(int *policy, nodemask_t *nmask, - unsigned long addr, unsigned long flags) +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) { - int err; + int err, pval; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -481,25 +557,31 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, err = lookup_node(mm, addr); if (err < 0) goto out; - *policy = err; + pval = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - *policy = current->il_next; + pval = current->il_next; } else { err = -EINVAL; goto out; } } else - *policy = pol->policy; + pval = pol->policy; if (vma) { up_read(&current->mm->mmap_sem); vma = NULL; } + if (policy && put_user(pval, policy)) + return -EFAULT; + err = 0; - if (nmask) - get_zonemask(pol, nmask); + if (nmask) { + 
DECLARE_BITMAP(nodes, MAX_NUMNODES); + get_zonemask(pol, nodes); + err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); + } out: if (vma) @@ -507,126 +589,6 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } -/* - * User space interface with variable sized bitmaps for nodelists. - */ - -/* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, - unsigned long maxnode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - nodes_clear(*nodes); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; -} - -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - nodemask_t *nodes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; -} - -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) -{ - nodemask_t nodes; - int err; - - err = get_nodes(&nodes, nmask, maxnode); - if (err) - return err; - return do_mbind(start, len, mode, &nodes, flags); -} - -/* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) -{ - int err; - nodemask_t nodes; - - if (mode < 0 || mode > MPOL_MAX) - return -EINVAL; - err = get_nodes(&nodes, nmask, maxnode); - if (err) - return err; - return do_set_mempolicy(mode, &nodes); -} - -/* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) -{ - int err, pval; - nodemask_t nodes; - - if (nmask != NULL && maxnode < MAX_NUMNODES) - return -EINVAL; - - err = do_get_mempolicy(&pval, &nodes, addr, flags); - - if (err) - return err; - - if (policy && put_user(pval, policy)) - return -EFAULT; - - if (nmask) - err = copy_nodes_to_user(nmask, maxnode, &nodes); - - return err; -} - #ifdef CONFIG_COMPAT asmlinkage long compat_sys_get_mempolicy(int __user *policy, @@ -687,15 +649,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, long err = 0; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; - nodemask_t bm; + DECLARE_BITMAP(bm, MAX_NUMNODES); nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { - err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); + err = compat_get_bitmap(bm, nmask, nr_bits); nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, nodes_addr(bm), alloc_size); + err |= copy_to_user(nm, bm, alloc_size); } if (err) @@ -714,7 +676,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo if (vma) { 
if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -760,9 +722,10 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); + BUG_ON(nid >= MAX_NUMNODES); + next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = find_first_bit(policy->v.nodes, MAX_NUMNODES); me->il_next = next; return nid; } @@ -771,27 +734,29 @@ static unsigned interleave_nodes(struct mempolicy *policy) static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { - unsigned nnodes = nodes_weight(pol->v.nodes); + unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); unsigned target = (unsigned)off % nnodes; int c; int nid = -1; c = 0; do { - nid = next_node(nid, pol->v.nodes); + nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); c++; } while (c <= target); + BUG_ON(nid >= MAX_NUMNODES); + BUG_ON(!test_bit(nid, pol->v.nodes)); return nid; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. 
*/ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { struct zonelist *zl; struct page *page; + BUG_ON(!node_online(nid)); zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { @@ -834,6 +799,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) unsigned nid; if (vma) { unsigned long off; + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); off = vma->vm_pgoff; off += (addr - vma->vm_start) >> PAGE_SHIFT; nid = offset_il_node(pol, vma, off); @@ -911,7 +878,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_DEFAULT: return 1; case MPOL_INTERLEAVE: - return nodes_equal(a->v.nodes, b->v.nodes); + return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { @@ -1150,7 +1117,7 @@ int mpol_set_shared_policy(struct shared_policy *info, PDprintk("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? npol->v.nodes[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1197,12 +1164,14 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. */ - if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) + if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), + MAX_NUMNODES) < 0) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default */ +/* Reset policy of current process to default. 
+ * Assumes fs == KERNEL_DS */ void numa_default_policy(void) { - do_set_mempolicy(MPOL_DEFAULT, NULL); + sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); } diff --git a/trunk/mm/mmap.c b/trunk/mm/mmap.c index 5ecc2cf3e1d7..fa11d91242e8 100644 --- a/trunk/mm/mmap.c +++ b/trunk/mm/mmap.c @@ -181,36 +181,26 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, } /* - * Unlink a file-based vm structure from its prio_tree, to hide - * vma from rmap and vmtruncate before freeing its page tables. + * Remove one vm structure and free it. */ -void unlink_file_vma(struct vm_area_struct *vma) +static void remove_vm_struct(struct vm_area_struct *vma) { struct file *file = vma->vm_file; + might_sleep(); if (file) { struct address_space *mapping = file->f_mapping; spin_lock(&mapping->i_mmap_lock); __remove_shared_vm_struct(vma, file, mapping); spin_unlock(&mapping->i_mmap_lock); } -} - -/* - * Close a vm structure and free it, returning the next. - */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) -{ - struct vm_area_struct *next = vma->vm_next; - - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); + if (file) + fput(file); + anon_vma_unlink(vma); mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); - return next; } asmlinkage unsigned long sys_brk(unsigned long brk) @@ -842,7 +832,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) } #ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, +void __vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { const unsigned long stack_flags @@ -1080,17 +1070,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) - == (VM_WRITE | VM_RESERVED)) { - printk(KERN_WARNING "program %s is 
using MAP_PRIVATE, " - "PROT_WRITE mmap of VM_RESERVED memory, which " - "is deprecated. Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - error = -EACCES; - goto unmap_and_free_vma; - } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) @@ -1131,7 +1110,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } out: mm->total_vm += len >> PAGE_SHIFT; - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); @@ -1496,19 +1475,15 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } -#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +#ifdef CONFIG_STACK_GROWSUP /* - * PA-RISC uses this for its stack; IA64 for its Register Backing Store. - * vma is the last one with address > vma->vm_end. Have to extend vma. + * vma is the first one with address > vma->vm_end. Have to extend vma. 
*/ -#ifdef CONFIG_STACK_GROWSUP -static inline -#endif -int expand_upwards(struct vm_area_struct *vma, unsigned long address) +int expand_stack(struct vm_area_struct * vma, unsigned long address) { int error; @@ -1546,13 +1521,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock(vma); return error; } -#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ - -#ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return expand_upwards(vma, address); -} struct vm_area_struct * find_extend_vma(struct mm_struct *mm, unsigned long addr) @@ -1635,24 +1603,36 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) } #endif +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list. + */ +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +{ + size_t len = area->vm_end - area->vm_start; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + vm_stat_unaccount(area); + remove_vm_struct(area); +} + /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Update the VMA and inode share lists. * - * Called with the mm semaphore held. + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and do the vma updates. 
*/ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { - /* Update high watermark before we lower total_vm */ - update_hiwater_vm(mm); do { - long nrpages = vma_pages(vma); - - mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); - vma = remove_vma(vma); + struct vm_area_struct *next = vma->vm_next; + unmap_vma(mm, vma); + vma = next; } while (vma); validate_mm(mm); } @@ -1671,13 +1651,14 @@ static void unmap_region(struct mm_struct *mm, unsigned long nr_accounted = 0; lru_add_drain(); + spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); + spin_unlock(&mm->page_table_lock); } /* @@ -1818,7 +1799,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unmap_region(mm, vma, prev, start, end); /* Fix up all other VM information */ - remove_vma_list(mm, vma); + unmap_vma_list(mm, vma); return 0; } @@ -1952,21 +1933,34 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; lru_add_drain(); + + spin_lock(&mm->page_table_lock); + flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + set_mm_counter(mm, rss, 0); + mm->total_vm = 0; + mm->locked_vm = 0; + + spin_unlock(&mm->page_table_lock); + /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. + * Walk the list again, actually closing and freeing it + * without holding any MM locks. 
*/ - while (vma) - vma = remove_vma(vma); + while (vma) { + struct vm_area_struct *next = vma->vm_next; + remove_vm_struct(vma); + vma = next; + } BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } diff --git a/trunk/mm/mprotect.c b/trunk/mm/mprotect.c index 17a2b52b753b..57577f63b305 100644 --- a/trunk/mm/mprotect.c +++ b/trunk/mm/mprotect.c @@ -29,9 +29,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { pte_t *pte; - spinlock_t *ptl; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = pte_offset_map(pmd, addr); do { if (pte_present(*pte)) { pte_t ptent; @@ -45,7 +44,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, lazy_mmu_prot_update(ptent); } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); } static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -89,6 +88,7 @@ static void change_protection(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -96,6 +96,7 @@ static void change_protection(struct vm_area_struct *vma, change_pud_range(mm, pgd, addr, next, newprot); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); + spin_unlock(&mm->page_table_lock); } static int @@ -124,14 +125,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (oldflags & VM_RESERVED) { - BUG_ON(oldflags & VM_WRITE); - printk(KERN_WARNING "program %s is using MAP_PRIVATE, " - "PROT_WRITE mprotect of VM_RESERVED memory, " - "which is deprecated. 
Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - return -EACCES; - } if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { charged = nrpages; if (security_vm_enough_memory(charged)) @@ -175,8 +168,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, vma->vm_flags = newflags; vma->vm_page_prot = newprot; change_protection(vma, start, end, newprot); - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + __vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; fail: diff --git a/trunk/mm/mremap.c b/trunk/mm/mremap.c index b535438c363c..f343fc73a8bd 100644 --- a/trunk/mm/mremap.c +++ b/trunk/mm/mremap.c @@ -22,7 +22,35 @@ #include #include -static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + goto end; + + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + goto end; + + pmd = pmd_offset(pud, addr); + if (pmd_none_or_clear_bad(pmd)) + goto end; + + pte = pte_offset_map_nested(pmd, addr); + if (pte_none(*pte)) { + pte_unmap_nested(pte); + pte = NULL; + } +end: + return pte; +} + +static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -40,39 +68,35 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pmd_none_or_clear_bad(pmd)) return NULL; - return pmd; + return pte_offset_map(pmd, addr); } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pte_t *pte = NULL; pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); if (!pud) return 
NULL; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return NULL; - - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) - return NULL; - - return pmd; + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); + return pte; } -static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, - unsigned long old_addr, unsigned long old_end, - struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) +static int +move_one_page(struct vm_area_struct *vma, unsigned long old_addr, + struct vm_area_struct *new_vma, unsigned long new_addr) { struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; - pte_t *old_pte, *new_pte, pte; - spinlock_t *old_ptl, *new_ptl; + int error = 0; + pte_t *src, *dst; if (vma->vm_file) { /* @@ -87,69 +111,74 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count != vma->vm_truncate_count) new_vma->vm_truncate_count = 0; } + spin_lock(&mm->page_table_lock); - /* - * We don't have to worry about the ordering of src and dst - * pte locks because exclusive mmap_sem prevents deadlock. - */ - old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); - new_ptl = pte_lockptr(mm, new_pmd); - if (new_ptl != old_ptl) - spin_lock(new_ptl); - - for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, - new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) - continue; - pte = ptep_clear_flush(vma, old_addr, old_pte); - /* ZERO_PAGE can be dependant on virtual addr */ - pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + src = get_one_pte_map_nested(mm, old_addr); + if (src) { + /* + * Look to see whether alloc_one_pte_map needs to perform a + * memory allocation. 
If it does then we need to drop the + * atomic kmap + */ + dst = get_one_pte_map(mm, new_addr); + if (unlikely(!dst)) { + pte_unmap_nested(src); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + dst = alloc_one_pte_map(mm, new_addr); + if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { + spin_unlock(&mm->page_table_lock); + spin_lock(&mapping->i_mmap_lock); + spin_lock(&mm->page_table_lock); + } + src = get_one_pte_map_nested(mm, old_addr); + } + /* + * Since alloc_one_pte_map can drop and re-acquire + * page_table_lock, we should re-check the src entry... + */ + if (src) { + if (dst) { + pte_t pte; + pte = ptep_clear_flush(vma, old_addr, src); + + /* ZERO_PAGE can be dependant on virtual addr */ + pte = move_pte(pte, new_vma->vm_page_prot, + old_addr, new_addr); + set_pte_at(mm, new_addr, dst, pte); + } else + error = -ENOMEM; + pte_unmap_nested(src); + } + if (dst) + pte_unmap(dst); } - - if (new_ptl != old_ptl) - spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); - pte_unmap_unlock(old_pte - 1, old_ptl); + spin_unlock(&mm->page_table_lock); if (mapping) spin_unlock(&mapping->i_mmap_lock); + return error; } -#define LATENCY_LIMIT (64 * PAGE_SIZE) - static unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { - unsigned long extent, next, old_end; - pmd_t *old_pmd, *new_pmd; + unsigned long offset; - old_end = old_addr + len; - flush_cache_range(vma, old_addr, old_end); + flush_cache_range(vma, old_addr, old_addr + len); - for (; old_addr < old_end; old_addr += extent, new_addr += extent) { - cond_resched(); - next = (old_addr + PMD_SIZE) & PMD_MASK; - if (next - 1 > old_end) - next = old_end; - extent = next - old_addr; - old_pmd = get_old_pmd(vma->vm_mm, old_addr); - if (!old_pmd) - continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); - if (!new_pmd) + /* + * This is not the clever way to do this, but we're taking the + * easy way out on 
the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + for (offset = 0; offset < len; offset += PAGE_SIZE) { + if (move_one_page(vma, old_addr + offset, + new_vma, new_addr + offset) < 0) break; - next = (new_addr + PMD_SIZE) & PMD_MASK; - if (extent > next - new_addr) - extent = next - new_addr; - if (extent > LATENCY_LIMIT) - extent = LATENCY_LIMIT; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + cond_resched(); } - - return len + old_addr - old_end; /* how much done */ + return offset; } static unsigned long move_vma(struct vm_area_struct *vma, @@ -162,7 +191,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long new_pgoff; unsigned long moved_len; unsigned long excess = 0; - unsigned long hiwater_vm; int split = 0; /* @@ -201,24 +229,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* - * If we failed to move page tables we still do total_vm increment - * since do_munmap() will decrement it by old_len == new_len. - * - * Since total_vm is about to be raised artificially high for a - * moment, we need to restore high watermark afterwards: if stats - * are taken meanwhile, total_vm and hiwater_vm appear too high. - * If this were a serious issue, we'd add a flag to do_munmap(). 
+ * if we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len */ - hiwater_vm = mm->hiwater_vm; mm->total_vm += new_len >> PAGE_SHIFT; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } - mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (excess) { @@ -248,7 +269,6 @@ unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; @@ -289,7 +309,7 @@ unsigned long do_mremap(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(current->mm, new_addr, new_len); if (ret) goto out; } @@ -300,7 +320,7 @@ unsigned long do_mremap(unsigned long addr, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(current->mm, addr+new_len, old_len - new_len); if (ret && old_len != new_len) goto out; ret = addr; @@ -313,7 +333,7 @@ unsigned long do_mremap(unsigned long addr, * Ok, we need to grow.. or relocate. 
*/ ret = -EFAULT; - vma = find_vma(mm, addr); + vma = find_vma(current->mm, addr); if (!vma || vma->vm_start > addr) goto out; if (is_vm_hugetlb_page(vma)) { @@ -329,14 +349,14 @@ unsigned long do_mremap(unsigned long addr, } if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; + locked = current->mm->locked_vm << PAGE_SHIFT; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += new_len - old_len; ret = -EAGAIN; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) goto out; } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { + if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { ret = -ENOMEM; goto out; } @@ -363,10 +383,11 @@ unsigned long do_mremap(unsigned long addr, vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); - mm->total_vm += pages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + current->mm->total_vm += pages; + __vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { - mm->locked_vm += pages; + current->mm->locked_vm += pages; make_pages_present(addr + old_len, addr + new_len); } diff --git a/trunk/mm/msync.c b/trunk/mm/msync.c index 0e040e9c39d8..d0f5a1bce7cb 100644 --- a/trunk/mm/msync.c +++ b/trunk/mm/msync.c @@ -17,48 +17,40 @@ #include #include -static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. 
+ */ + +static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; - spinlock_t *ptl; - int progress = 0; -again: - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + pte = pte_offset_map(pmd, addr); do { unsigned long pfn; struct page *page; - if (progress >= 64) { - progress = 0; - if (need_resched() || need_lockbreak(ptl)) - break; - } - progress++; if (!pte_present(*pte)) continue; if (!pte_maybe_dirty(*pte)) continue; pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + if (!pfn_valid(pfn)) continue; - } page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); - progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - if (addr != end) - goto again; + pte_unmap(pte - 1); } -static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; @@ -69,11 +61,11 @@ static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - msync_pte_range(vma, pmd, addr, next); + sync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); } -static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; @@ -84,34 +76,58 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - msync_pmd_range(vma, pud, addr, next); + sync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); } -static void msync_page_range(struct 
vm_area_struct *vma, +static void sync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need - * to do anything more on an msync(). - * Can't do anything with VM_RESERVED regions either. - */ - if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) + * to do anything more on an msync() */ + if (is_vm_hugetlb_page(vma)) return; BUG_ON(addr >= end); - pgd = pgd_offset(vma->vm_mm, addr); + pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - msync_pud_range(vma, pgd, addr, next); + sync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); +} + +#ifdef CONFIG_PREEMPT +static inline void filemap_sync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + const size_t chunk = 64 * 1024; /* bytes */ + unsigned long next; + + do { + next = addr + chunk; + if (next > end || next < addr) + next = end; + sync_page_range(vma, addr, next); + cond_resched(); + } while (addr = next, addr != end); +} +#else +static inline void filemap_sync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + sync_page_range(vma, addr, end); } +#endif /* * MS_SYNC syncs the entire file - including mappings. 
@@ -134,7 +150,7 @@ static int msync_interval(struct vm_area_struct *vma, return -EBUSY; if (file && (vma->vm_flags & VM_SHARED)) { - msync_page_range(vma, addr, end); + filemap_sync(vma, addr, end); if (flags & MS_SYNC) { struct address_space *mapping = file->f_mapping; diff --git a/trunk/mm/nommu.c b/trunk/mm/nommu.c index d1e076a487cb..0ef241ae3763 100644 --- a/trunk/mm/nommu.c +++ b/trunk/mm/nommu.c @@ -931,8 +931,6 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) realalloc -= kobjsize(vml); askedalloc -= sizeof(*vml); kfree(vml); - - update_hiwater_vm(mm); mm->total_vm -= len >> PAGE_SHIFT; #ifdef DEBUG @@ -1049,8 +1047,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, - unsigned int foll_flags) +struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) { return NULL; } @@ -1081,6 +1078,19 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } +void update_mem_hiwater(struct task_struct *tsk) +{ + unsigned long rss; + + if (likely(tsk->mm)) { + rss = get_mm_counter(tsk->mm, rss); + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/trunk/mm/page_alloc.c b/trunk/mm/page_alloc.c index 2dbdd98426fd..94c864eac9c4 100644 --- a/trunk/mm/page_alloc.c +++ b/trunk/mm/page_alloc.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -79,44 +78,21 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; -static int page_outside_zone_boundaries(struct zone *zone, struct page *page) -{ - int ret = 0; - unsigned seq; - unsigned long pfn = page_to_pfn(page); - - do { - seq = 
zone_span_seqbegin(zone); - if (pfn >= zone->zone_start_pfn + zone->spanned_pages) - ret = 1; - else if (pfn < zone->zone_start_pfn) - ret = 1; - } while (zone_span_seqretry(zone, seq)); - - return ret; -} - -static int page_is_consistent(struct zone *zone, struct page *page) -{ -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) - return 0; -#endif - if (zone != page_zone(page)) - return 0; - - return 1; -} /* * Temporary debugging check for pages not lying within a given zone. */ static int bad_range(struct zone *zone, struct page *page) { - if (page_outside_zone_boundaries(zone, page)) + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) return 1; - if (!page_is_consistent(zone, page)) + if (page_to_pfn(page) < zone->zone_start_pfn) + return 1; +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 1; +#endif + if (zone != page_zone(page)) return 1; - return 0; } @@ -138,8 +114,7 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ); + 1 << PG_writeback); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -178,7 +153,7 @@ static void prep_compound_page(struct page *page, unsigned long order) struct page *p = page + i; SetPageCompound(p); - set_page_private(p, (unsigned long)page); + p->private = (unsigned long)page; } } @@ -198,7 +173,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) + if (p->private != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } @@ -211,18 +186,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) * So, we don't need atomic page->flags operations here. 
*/ static inline unsigned long page_order(struct page *page) { - return page_private(page); + return page->private; } static inline void set_page_order(struct page *page, int order) { - set_page_private(page, order); + page->private = order; __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - set_page_private(page, 0); + page->private = 0; } /* @@ -262,13 +237,14 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (a) the buddy is free && * (b) the buddy is on the buddy system && * (c) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * for recording page's order, we use page->private and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { if (PagePrivate(page) && (page_order(page) == order) && + !PageReserved(page) && page_count(page) == 0) return 1; return 0; @@ -288,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order) * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page_private(page) field. + * order is recorded in page->private field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. 
@@ -351,8 +327,7 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -480,14 +455,13 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_writeback ))) bad_page(__FUNCTION__, page); page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - set_page_private(page, 0); + page->private = 0; set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); } @@ -1042,7 +1016,7 @@ void __pagevec_free(struct pagevec *pvec) fastcall void __free_pages(struct page *page, unsigned int order) { - if (put_page_testzero(page)) { + if (!PageReserved(page) && put_page_testzero(page)) { if (order == 0) free_hot_page(page); else @@ -1331,9 +1305,12 @@ void show_free_areas(void) } else printk("\n"); - for_each_cpu(cpu) { + for (cpu = 0; cpu < NR_CPUS; ++cpu) { struct per_cpu_pageset *pageset; + if (!cpu_possible(cpu)) + continue; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) @@ -1683,7 +1660,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. 
*/ -void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1697,7 +1674,7 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + set_page_count(page, 0); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); @@ -1744,29 +1721,29 @@ static int __devinit zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/2 of a meg. + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. * * OK, so we don't know how big the cache is. So guess. */ batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 512 * 1024) - batch = (512 * 1024) / PAGE_SIZE; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; /* - * We will be trying to allcoate bigger chunks of contiguous - * memory of the order of fls(batch). This should result in - * better cache coloring. + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. * - * A sanity check also to ensure that batch is still in limits. + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. 
*/ - batch = (1 << fls(batch + batch/2)); - - if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) - batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); - + batch = (1 << fls(batch + batch/2)) - 1; return batch; } @@ -1778,7 +1755,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; + pcp->low = 2 * batch; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); @@ -1787,7 +1764,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->low = 0; pcp->high = 2 * batch; - pcp->batch = max(1UL, batch/2); + pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); } @@ -1896,60 +1873,6 @@ void __init setup_per_cpu_pageset() #endif -static __devinit -void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - struct pglist_data *pgdat = zone->zone_pgdat; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); -} - -static __devinit void zone_pcp_init(struct zone *zone) -{ - int cpu; - unsigned long batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. 
Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); -} - -static __devinit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - - zone_wait_table_init(zone, size); - pgdat->nr_zones = zone_idx(zone) + 1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); -} - /* * Set up the zone data structures: * - mark all pages reserved @@ -1959,11 +1882,10 @@ static __devinit void init_currently_empty_zone(struct zone *zone, static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long j; - int nid = pgdat->node_id; + unsigned long i, j; + int cpu, nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; - pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1971,6 +1893,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; + unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) @@ -1985,13 +1908,24 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); - zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); + batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; 
cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -2002,9 +1936,32 @@ static void __init free_area_init_core(struct pglist_data *pgdat, if (!size) continue; + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, nid, j, zone_start_pfn); + zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + zone_start_pfn += size; + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2404,7 +2361,7 @@ static void setup_per_zone_lowmem_reserve(void) * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. 
*/ -void setup_per_zone_pages_min(void) +static void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; diff --git a/trunk/mm/page_io.c b/trunk/mm/page_io.c index bb2b0d53889c..330e00d6db00 100644 --- a/trunk/mm/page_io.c +++ b/trunk/mm/page_io.c @@ -91,8 +91,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page_private(page), page, - end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -116,8 +115,7 @@ int swap_readpage(struct file *file, struct page *page) BUG_ON(!PageLocked(page)); ClearPageUptodate(page); - bio = get_swap_bio(GFP_KERNEL, page_private(page), page, - end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/trunk/mm/rmap.c b/trunk/mm/rmap.c index 914d04b98bee..450f5241b5a5 100644 --- a/trunk/mm/rmap.c +++ b/trunk/mm/rmap.c @@ -32,7 +32,7 @@ * page->flags PG_locked (lock_page) * mapping->i_mmap_lock * anon_vma->lock - * mm->page_table_lock or pte_lock + * mm->page_table_lock * zone->lru_lock (in mark_page_accessed) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) @@ -244,44 +244,37 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * - * On success returns with pte mapped and locked. + * On success returns with mapped pte and locked mm->page_table_lock. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp) + unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... 
+ */ + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - - pte = pte_offset_map(pmd, address); - /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { - pte_unmap(pte); - return NULL; - } - - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; + if (likely(pgd_present(*pgd))) { + pud = pud_offset(pgd, address); + if (likely(pud_present(*pud))) { + pmd = pmd_offset(pud, address); + if (likely(pmd_present(*pmd))) { + pte = pte_offset_map(pmd, address); + if (likely(pte_present(*pte) && + page_to_pfn(page) == pte_pfn(*pte))) + return pte; + pte_unmap(pte); + } + } } - pte_unmap_unlock(pte, ptl); - return NULL; + spin_unlock(&mm->page_table_lock); + return ERR_PTR(-ENOENT); } /* @@ -294,28 +287,24 @@ static int page_referenced_one(struct page *page, struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *pte; - spinlock_t *ptl; int referenced = 0; address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); - if (!pte) - goto out; - - if (ptep_clear_flush_young(vma, address, pte)) - referenced++; + pte = page_check_address(page, mm, address); + if (!IS_ERR(pte)) { + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. 
*/ - if (mm != current->mm && !ignore_token && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; + if (mm != current->mm && !ignore_token && has_swap_token(mm)) + referenced++; - (*mapcount)--; - pte_unmap_unlock(pte, ptl); + (*mapcount)--; + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + } out: return referenced; } @@ -445,11 +434,15 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the pte lock. + * The caller needs to hold the mm->page_table_lock. */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + BUG_ON(PageReserved(page)); + + inc_mm_counter(vma->vm_mm, anon_rss); + if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; @@ -468,12 +461,13 @@ void page_add_anon_rmap(struct page *page, * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to * - * The caller needs to hold the pte lock. + * The caller needs to hold the mm->page_table_lock. */ void page_add_file_rmap(struct page *page) { BUG_ON(PageAnon(page)); - BUG_ON(!pfn_valid(page_to_pfn(page))); + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; if (atomic_inc_and_test(&page->_mapcount)) inc_page_state(nr_mapped); @@ -483,10 +477,12 @@ void page_add_file_rmap(struct page *page) * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from * - * The caller needs to hold the pte lock. + * Caller needs to hold the mm->page_table_lock. 
*/ void page_remove_rmap(struct page *page) { + BUG_ON(PageReserved(page)); + if (atomic_add_negative(-1, &page->_mapcount)) { BUG_ON(page_mapcount(page) < 0); /* @@ -514,15 +510,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) unsigned long address; pte_t *pte; pte_t pteval; - spinlock_t *ptl; int ret = SWAP_AGAIN; address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); - if (!pte) + pte = page_check_address(page, mm, address); + if (IS_ERR(pte)) goto out; /* @@ -546,11 +541,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) if (pte_dirty(pteval)) set_page_dirty(page); - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - if (PageAnon(page)) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = { .val = page->private }; /* * Store the swap location in the pte. * See handle_pte_fault() ... @@ -559,21 +551,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) swap_duplicate(entry); if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); + list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); dec_mm_counter(mm, anon_rss); - } else - dec_mm_counter(mm, file_rss); + } + dec_mm_counter(mm, rss); page_remove_rmap(page); page_cache_release(page); out_unmap: - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); out: return ret; } @@ -607,14 +599,19 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte; + pte_t *pte, *original_pte; pte_t pteval; - spinlock_t *ptl; struct page *page; unsigned long address; unsigned long end; unsigned long pfn; + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, 
etc... + */ + spin_lock(&mm->page_table_lock); + address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; if (address < vma->vm_start) @@ -624,33 +621,30 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - return; + goto out_unlock; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - return; + goto out_unlock; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - return; - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); + goto out_unlock; - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); + for (original_pte = pte = pte_offset_map(pmd, address); + address < end; pte++, address += PAGE_SIZE) { - for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); + if (!pfn_valid(pfn)) continue; - } page = pfn_to_page(pfn); BUG_ON(PageAnon(page)); + if (PageReserved(page)) + continue; if (ptep_clear_flush_young(vma, address, pte)) continue; @@ -669,10 +663,13 @@ static void try_to_unmap_cluster(unsigned long cursor, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, rss); (*mapcount)--; } - pte_unmap_unlock(pte - 1, ptl); + + pte_unmap(original_pte); +out_unlock: + spin_unlock(&mm->page_table_lock); } static int try_to_unmap_anon(struct page *page) @@ -809,6 +806,7 @@ int try_to_unmap(struct page *page) { int ret; + BUG_ON(PageReserved(page)); BUG_ON(!PageLocked(page)); if (PageAnon(page)) diff --git a/trunk/mm/shmem.c b/trunk/mm/shmem.c index dc25565a61e9..55e04a0734c1 100644 --- a/trunk/mm/shmem.c +++ b/trunk/mm/shmem.c @@ -71,6 +71,9 @@ /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Keep swapped page count in private field of indirect struct page */ +#define nr_swapped private + /* Flag allocation requirements to shmem_getpage 
and shmem_swp_alloc */ enum sgp_type { SGP_QUICK, /* don't try more than file page cache lookup */ @@ -321,10 +324,8 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns entry->val = value; info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { - struct page *page = kmap_atomic_to_page(entry); - set_page_private(page, page_private(page) + incdec); - } + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) + kmap_atomic_to_page(entry)->nr_swapped += incdec; } /* @@ -367,8 +368,9 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); - if (page) - set_page_private(page, 0); + if (page) { + page->nr_swapped = 0; + } spin_lock(&info->lock); if (!page) { @@ -559,7 +561,7 @@ static void shmem_truncate(struct inode *inode) diroff = 0; } subdir = dir[diroff]; - if (subdir && page_private(subdir)) { + if (subdir && subdir->nr_swapped) { size = limit - idx; if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; @@ -570,10 +572,10 @@ static void shmem_truncate(struct inode *inode) nr_swaps_freed += freed; if (offset) spin_lock(&info->lock); - set_page_private(subdir, page_private(subdir) - freed); + subdir->nr_swapped -= freed; if (offset) spin_unlock(&info->lock); - BUG_ON(page_private(subdir) > offset); + BUG_ON(subdir->nr_swapped > offset); } if (offset) offset = 0; @@ -741,7 +743,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s dir = shmem_dir_map(subdir); } subdir = *dir; - if (subdir && page_private(subdir)) { + if (subdir && subdir->nr_swapped) { ptr = shmem_swp_map(subdir); size = limit - idx; if (size > ENTRIES_PER_PAGE) @@ -1199,7 +1201,7 @@ static int shmem_populate(struct vm_area_struct *vma, page_cache_release(page); return err; } - } else if (vma->vm_flags & VM_NONLINEAR) { + } else { /* No page was found just because we 
can't read it in * now (being here implies nonblock != 0), but the page * may exist, so set the PTE to fault it in later. */ @@ -1504,10 +1506,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ */ if (!offset) mark_page_accessed(page); - } else { + } else page = ZERO_PAGE(0); - page_cache_get(page); - } /* * Ok, we have the page, and it's up-to-date, so diff --git a/trunk/mm/slab.c b/trunk/mm/slab.c index 22bfb0b2ac8b..d30423f167a2 100644 --- a/trunk/mm/slab.c +++ b/trunk/mm/slab.c @@ -2419,7 +2419,6 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) next = slab_bufctl(slabp)[slabp->free]; #if DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - WARN_ON(numa_node_id() != slabp->nodeid); #endif slabp->free = next; } @@ -2634,10 +2633,8 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); -#if DEBUG - /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != node); +#if DEBUG if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); diff --git a/trunk/mm/sparse.c b/trunk/mm/sparse.c index 72079b538e2d..347249a4917a 100644 --- a/trunk/mm/sparse.c +++ b/trunk/mm/sparse.c @@ -5,10 +5,8 @@ #include #include #include -#include #include #include -#include #include /* @@ -74,31 +72,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -/* - * Although written for the SPARSEMEM_EXTREME case, this happens - * to also work for the flat array case becase - * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 
- */ -int __section_nr(struct mem_section* ms) -{ - unsigned long root_nr; - struct mem_section* root; - - for (root_nr = 0; - root_nr < NR_MEM_SECTIONS; - root_nr += SECTIONS_PER_ROOT) { - root = __nr_to_section(root_nr); - - if (!root) - continue; - - if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) - break; - } - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); -} - /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -189,45 +162,6 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) return NULL; } -static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -{ - struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * nr_pages; - - page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); - if (page) - goto got_map_page; - - ret = vmalloc(memmap_size); - if (ret) - goto got_map_ptr; - - return NULL; -got_map_page: - ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); -got_map_ptr: - memset(ret, 0, memmap_size); - - return ret; -} - -static int vaddr_in_vmalloc_area(void *addr) -{ - if (addr >= (void *)VMALLOC_START && - addr < (void *)VMALLOC_END) - return 1; - return 0; -} - -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) -{ - if (vaddr_in_vmalloc_area(memmap)) - vfree(memmap); - else - free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * nr_pages)); -} - /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -253,37 +187,14 @@ void sparse_init(void) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. 
*/ -int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages) +int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) { - unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; - struct mem_section *ms; - struct page *memmap; - unsigned long flags; - int ret; - - /* - * no locking for this, because it does its own - * plus, it does a kmalloc - */ - sparse_index_init(section_nr, pgdat->node_id); - memmap = __kmalloc_section_memmap(nr_pages); + struct mem_section *ms = __pfn_to_section(start_pfn); - pgdat_resize_lock(pgdat, &flags); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) + return -EEXIST; - ms = __pfn_to_section(start_pfn); - if (ms->section_mem_map & SECTION_MARKED_PRESENT) { - ret = -EEXIST; - goto out; - } ms->section_mem_map |= SECTION_MARKED_PRESENT; - ret = sparse_init_one_section(ms, section_nr, memmap); - - if (ret <= 0) - __kfree_section_memmap(memmap, nr_pages); -out: - pgdat_resize_unlock(pgdat, &flags); - return ret; + return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); } diff --git a/trunk/mm/swap.c b/trunk/mm/swap.c index b89512877ec2..7771d2803f62 100644 --- a/trunk/mm/swap.c +++ b/trunk/mm/swap.c @@ -39,7 +39,7 @@ int page_cluster; void put_page(struct page *page) { if (unlikely(PageCompound(page))) { - page = (struct page *)page_private(page); + page = (struct page *)page->private; if (put_page_testzero(page)) { void (*dtor)(struct page *page); @@ -48,7 +48,7 @@ void put_page(struct page *page) } return; } - if (put_page_testzero(page)) + if (!PageReserved(page) && put_page_testzero(page)) __page_cache_release(page); } EXPORT_SYMBOL(put_page); @@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold) struct page *page = pages[i]; struct zone *pagezone; - if (!put_page_testzero(page)) + if (PageReserved(page) || !put_page_testzero(page)) continue; pagezone = page_zone(page); diff --git 
a/trunk/mm/swap_state.c b/trunk/mm/swap_state.c index dfd9a46755b8..132164f7d0a7 100644 --- a/trunk/mm/swap_state.c +++ b/trunk/mm/swap_state.c @@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); - set_page_private(page, entry.val); + page->private = entry.val; total_swapcache_pages++; pagecache_acct(1); } @@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); - set_page_private(page, 0); + radix_tree_delete(&swapper_space.page_tree, page->private); + page->private = 0; ClearPageSwapCache(page); total_swapcache_pages--; pagecache_acct(-1); @@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - entry.val = page_private(page); + entry.val = page->private; write_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); @@ -259,7 +259,8 @@ static inline void free_swap_cache(struct page *page) /* * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. + * this page if it is the last user of the page. Can not do a lock_page, + * as we are holding the page_table_lock spinlock. 
*/ void free_page_and_swap_cache(struct page *page) { diff --git a/trunk/mm/swapfile.c b/trunk/mm/swapfile.c index 8970c0b74194..1dcaeda039f4 100644 --- a/trunk/mm/swapfile.c +++ b/trunk/mm/swapfile.c @@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) swp_entry_t entry; down_read(&swap_unplug_sem); - entry.val = page_private(page); + entry.val = page->private; if (PageSwapCache(page)) { struct block_device *bdev = swap_info[swp_type(entry)].bdev; struct backing_dev_info *bdi; @@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) /* * If the page is removed from swapcache from under us (with a * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race + * count to avoid reading garbage from page->private above. If + * the WARN_ON triggers during a swapoff it maybe the race * condition and it's harmless. However if it triggers without * swapoff it signals a problem. */ @@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) struct swap_info_struct *p; swp_entry_t entry; - entry.val = page_private(page); + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Subtract the 1 for the swap cache itself */ @@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page_private(page); + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -398,14 +398,17 @@ void free_swap_and_cache(swp_entry_t entry) } /* - * No need to decide whether this PTE shares the swap entry with others, - * just let do_wp_page work it out if a write is requested later - to - * force COW, vm_page_prot omits write permission from any private vma. + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). 
We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. + * + * vma->vm_mm->page_table_lock is held. */ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, swp_entry_t entry, struct page *page) { - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(vma->vm_mm, rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); @@ -422,25 +425,23 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { - pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; - spinlock_t *ptl; - int found = 0; + pte_t swp_pte = swp_entry_to_pte(entry); - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + pte = pte_offset_map(pmd, addr); do { /* * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); - found = 1; - break; + unuse_pte(vma, pte, addr, entry, page); + pte_unmap(pte); + return 1; } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); - return found; + pte_unmap(pte - 1); + return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -522,10 +523,12 @@ static int unuse_mm(struct mm_struct *mm, down_read(&mm->mmap_sem); lock_page(page); } + spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && unuse_vma(vma, entry, page)) break; } + spin_unlock(&mm->page_table_lock); up_read(&mm->mmap_sem); /* * Currently unuse_mm cannot fail, but leave error handling @@ -1042,7 +1045,7 @@ int page_queue_congested(struct page *page) BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = { .val = 
page->private }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff --git a/trunk/mm/thrash.c b/trunk/mm/thrash.c index eff3c18c33a1..11461f7ad830 100644 --- a/trunk/mm/thrash.c +++ b/trunk/mm/thrash.c @@ -19,7 +19,7 @@ static unsigned long swap_token_check; struct mm_struct * swap_token_mm = &init_mm; #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) -#define SWAP_TOKEN_TIMEOUT (300 * HZ) +#define SWAP_TOKEN_TIMEOUT 0 /* * Currently disabled; Needs further code to work at HZ * 300. */ diff --git a/trunk/mm/vmalloc.c b/trunk/mm/vmalloc.c index 54a90e83cb31..1150229b6366 100644 --- a/trunk/mm/vmalloc.c +++ b/trunk/mm/vmalloc.c @@ -5,7 +5,6 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 - * Numa awareness, Christoph Lameter, SGI, June 2005 */ #include @@ -89,7 +88,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, { pte_t *pte; - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel(&init_mm, pmd, addr); if (!pte) return -ENOMEM; do { @@ -147,18 +146,20 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) BUG_ON(addr >= end); pgd = pgd_offset_k(addr); + spin_lock(&init_mm.page_table_lock); do { next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages); if (err) break; } while (pgd++, addr = next, addr != end); + spin_unlock(&init_mm.page_table_lock); flush_cache_vmap((unsigned long) area->addr, end); return err; } -struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, int node) +struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; @@ -177,7 +178,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, addr = 
ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); + area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) return NULL; @@ -230,12 +231,6 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, return NULL; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, flags, start, end, -1); -} - /** * get_vm_area - reserve a contingous kernel virtual area * @@ -251,11 +246,6 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) -{ - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); -} - /* Caller must hold vmlist_lock */ struct vm_struct *__remove_vm_area(void *addr) { @@ -352,6 +342,7 @@ void vfree(void *addr) BUG_ON(in_interrupt()); __vunmap(addr, 1); } + EXPORT_SYMBOL(vfree); /** @@ -369,6 +360,7 @@ void vunmap(void *addr) BUG_ON(in_interrupt()); __vunmap(addr, 0); } + EXPORT_SYMBOL(vunmap); /** @@ -400,10 +392,10 @@ void *vmap(struct page **pages, unsigned int count, return area->addr; } + EXPORT_SYMBOL(vmap); -void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) +void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -414,9 +406,9 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) - pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); + pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); else - pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); + pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); area->pages = pages; if (!area->pages) { remove_vm_area(area->addr); @@ -426,10 +418,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, memset(area->pages, 0, array_size); for (i = 0; i < area->nr_pages; i++) { - if (node < 0) - area->pages[i] = alloc_page(gfp_mask); - else - area->pages[i] = alloc_pages_node(node, gfp_mask, 0); + area->pages[i] = alloc_page(gfp_mask); if (unlikely(!area->pages[i])) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; @@ -446,25 +435,18 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - return __vmalloc_area_node(area, gfp_mask, prot, -1); -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc - allocate virtually contiguous memory * * @size: allocation size * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages - * @node node to use for allocation or -1 * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. 
*/ -void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node) +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { struct vm_struct *area; @@ -472,18 +454,13 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node); + area = get_vm_area(size, VM_ALLOC); if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node); + return __vmalloc_area(area, gfp_mask, prot); } -EXPORT_SYMBOL(__vmalloc_node); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) -{ - return __vmalloc_node(size, gfp_mask, prot, -1); -} EXPORT_SYMBOL(__vmalloc); /** @@ -501,25 +478,8 @@ void *vmalloc(unsigned long size) { return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } -EXPORT_SYMBOL(vmalloc); -/** - * vmalloc_node - allocate memory on a specific node - * - * @size: allocation size - * @node; numa node - * - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. - * - * For tight cotrol over page level allocator and protection flags - * use __vmalloc() instead. - */ -void *vmalloc_node(unsigned long size, int node) -{ - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); -} -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc); #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL @@ -555,6 +515,7 @@ void *vmalloc_32(unsigned long size) { return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); } + EXPORT_SYMBOL(vmalloc_32); long vread(char *buf, char *addr, unsigned long count) diff --git a/trunk/mm/vmscan.c b/trunk/mm/vmscan.c index 135bf8ca96ee..843c87d1e61f 100644 --- a/trunk/mm/vmscan.c +++ b/trunk/mm/vmscan.c @@ -417,9 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? 
* Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { if (!add_to_swap(page)) goto activate_locked; } @@ -521,7 +519,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; + swp_entry_t swap = { .val = page->private }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); diff --git a/trunk/sound/core/pcm_native.c b/trunk/sound/core/pcm_native.c index e97b2d162cc7..67abebabf83e 100644 --- a/trunk/sound/core/pcm_native.c +++ b/trunk/sound/core/pcm_native.c @@ -2949,7 +2949,8 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->status); - get_page(page); + if (!PageReserved(page)) + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -2991,7 +2992,8 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->control); - get_page(page); + if (!PageReserved(page)) + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -3064,7 +3066,8 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign vaddr = runtime->dma_area + offset; page = virt_to_page(vaddr); } - get_page(page); + if (!PageReserved(page)) + get_page(page); if (type) *type = VM_FAULT_MINOR; return page;