From 2b4bf769772e59dea6e91574666ae1a751a39aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 21:48:31 +0100 Subject: [PATCH] --- yaml --- r: 138625 b: refs/heads/master c: a1413c89ae6a4b7a9a43f7768934a81ffb5c629a h: refs/heads/master i: 138623: 3d9b6851557dbe37145588f697eff2b499e851fb v: v3 --- [refs] | 2 +- trunk/Makefile | 2 +- trunk/arch/alpha/mm/init.c | 20 +- trunk/arch/arm/kernel/setup.c | 13 +- trunk/arch/arm/mach-at91/pm.c | 1 - trunk/arch/arm/mm/abort-ev6.S | 3 +- trunk/arch/arm/plat-s3c64xx/irq-eint.c | 2 +- trunk/arch/avr32/Kconfig | 2 +- .../arch/powerpc/platforms/86xx/gef_sbc610.c | 4 + trunk/arch/s390/crypto/aes_s390.c | 2 +- trunk/arch/x86/Kconfig | 5 +- trunk/arch/x86/include/asm/cacheflush.h | 53 +- trunk/arch/x86/include/asm/efi.h | 2 - trunk/arch/x86/include/asm/fixmap.h | 10 - trunk/arch/x86/include/asm/i387.h | 8 +- trunk/arch/x86/include/asm/io.h | 3 - trunk/arch/x86/include/asm/mmzone_32.h | 43 +- trunk/arch/x86/include/asm/percpu.h | 8 + trunk/arch/x86/include/asm/pgtable.h | 2 + trunk/arch/x86/include/asm/processor.h | 3 - trunk/arch/x86/include/asm/xen/page.h | 1 + trunk/arch/x86/kernel/cpu/common.c | 358 +++---- .../x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- trunk/arch/x86/kernel/efi.c | 7 +- trunk/arch/x86/kernel/efi_64.c | 21 +- trunk/arch/x86/kernel/i387.c | 2 +- trunk/arch/x86/kernel/irq_32.c | 29 +- trunk/arch/x86/kernel/process.c | 5 +- trunk/arch/x86/kernel/reboot.c | 8 + trunk/arch/x86/kernel/setup.c | 5 +- trunk/arch/x86/kernel/setup_percpu.c | 367 ++++++- trunk/arch/x86/kernel/visws_quirks.c | 2 +- trunk/arch/x86/math-emu/fpu_aux.c | 31 +- trunk/arch/x86/mm/init_32.c | 17 + trunk/arch/x86/mm/init_64.c | 72 +- trunk/arch/x86/mm/kmmio.c | 149 ++- trunk/arch/x86/mm/testmmiotrace.c | 70 +- trunk/arch/x86/xen/enlighten.c | 10 +- trunk/arch/x86/xen/mmu.c | 7 + trunk/arch/x86/xen/smp.c | 6 +- trunk/block/blktrace.c | 2 +- trunk/crypto/api.c | 15 +- trunk/drivers/acpi/processor_perflib.c | 4 +- trunk/drivers/crypto/ixp4xx_crypto.c | 6 +- trunk/drivers/crypto/padlock-aes.c | 2 +- trunk/drivers/crypto/padlock-sha.c | 4 +- trunk/drivers/dma/iop-adma.c | 2 +- trunk/drivers/dma/mv_xor.c | 2 +- trunk/drivers/gpu/drm/drm_stub.c | 2 +- trunk/drivers/i2c/busses/i2c-mv64xxx.c | 4 +- trunk/drivers/mtd/nand/orion_nand.c | 2 +- trunk/drivers/net/arm/Makefile | 2 +- trunk/drivers/net/arm/etherh.c | 10 +- trunk/drivers/video/pxafb.c | 2 +- trunk/include/linux/bootmem.h | 36 +- trunk/include/linux/percpu.h | 102 +- trunk/include/linux/rcuclassic.h | 6 + trunk/include/linux/rcupdate.h | 4 + trunk/include/linux/rcupreempt.h | 15 + trunk/include/linux/rcutree.h | 6 + trunk/include/linux/sched.h | 4 + trunk/include/linux/vmalloc.h | 4 + trunk/init/main.c | 3 +- trunk/kernel/module.c | 64 +- trunk/kernel/rcuclassic.c | 4 +- trunk/kernel/rcupdate.c | 12 + trunk/kernel/rcupreempt.c | 3 + trunk/kernel/rcutree.c | 4 +- trunk/kernel/sched.c | 21 +- trunk/kernel/stop_machine.c | 2 +- trunk/kernel/sys.c | 31 +- trunk/kernel/user.c | 18 + trunk/mm/Makefile | 4 + trunk/mm/allocpercpu.c | 32 +- trunk/mm/bootmem.c | 35 +- trunk/mm/percpu.c | 979 ++++++++++++++++++ trunk/mm/vmalloc.c | 94 +- trunk/net/ipv4/af_inet.c | 4 +- trunk/net/ipv4/route.c | 2 +- 79 files changed, 2306 insertions(+), 599 deletions(-) create mode 100644 trunk/mm/percpu.c diff --git a/[refs] b/[refs] index 29caf4c47a69..b740242e70a0 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 250981e6e1ef04947eccaa55e8c25ddcaa47a02e +refs/heads/master: 
a1413c89ae6a4b7a9a43f7768934a81ffb5c629a diff --git a/trunk/Makefile b/trunk/Makefile index 27fb890a2bff..c40d83aedebe 100644 --- a/trunk/Makefile +++ b/trunk/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 29 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Erotic Pickled Herring # *DOCUMENTATION* diff --git a/trunk/arch/alpha/mm/init.c b/trunk/arch/alpha/mm/init.c index 5d7a16eab312..af71d38c8e41 100644 --- a/trunk/arch/alpha/mm/init.c +++ b/trunk/arch/alpha/mm/init.c @@ -189,9 +189,21 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long vaddr = VMALLOC_START; + unsigned long nr_pages = 0; + unsigned long vaddr; unsigned long i, j; + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm, PAGE_SIZE); + + vaddr = (unsigned long)console_remap_vm.addr; + /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -213,12 +225,6 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } - - /* Let vmalloc know that we've allocated some space. */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.addr = (void *) VMALLOC_START; - console_remap_vm.size = vaddr - VMALLOC_START; - vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/trunk/arch/arm/kernel/setup.c b/trunk/arch/arm/kernel/setup.c index 7049815d66d5..68d6494c0389 100644 --- a/trunk/arch/arm/kernel/setup.c +++ b/trunk/arch/arm/kernel/setup.c @@ -233,12 +233,13 @@ static void __init cacheid_init(void) unsigned int cachetype = read_cpuid_cachetype(); unsigned int arch = cpu_architecture(); - if (arch >= CPU_ARCH_ARMv7) { - cacheid = CACHEID_VIPT_NONALIASING; - if ((cachetype & (3 << 14)) == 1 << 14) - cacheid |= CACHEID_ASID_TAGGED; - } else if (arch >= CPU_ARCH_ARMv6) { - if (cachetype & (1 << 23)) + if (arch >= CPU_ARCH_ARMv6) { + if ((cachetype & (7 << 29)) == 4 << 29) { + /* ARMv7 register format */ + cacheid = CACHEID_VIPT_NONALIASING; + if ((cachetype & (3 << 14)) == 1 << 14) + cacheid |= CACHEID_ASID_TAGGED; + } else if (cachetype & (1 << 23)) cacheid = CACHEID_VIPT_ALIASING; else cacheid = CACHEID_VIPT_NONALIASING; diff --git a/trunk/arch/arm/mach-at91/pm.c b/trunk/arch/arm/mach-at91/pm.c index 9bb4f043aa22..7ac812dc055a 100644 --- a/trunk/arch/arm/mach-at91/pm.c +++ b/trunk/arch/arm/mach-at91/pm.c @@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state) at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); error: - sdram_selfrefresh_disable(); target_state = PM_SUSPEND_ON; at91_irq_resume(); at91_gpio_resume(); diff --git a/trunk/arch/arm/mm/abort-ev6.S b/trunk/arch/arm/mm/abort-ev6.S index 8a7f65ba14b7..94077fbd96b7 100644 --- a/trunk/arch/arm/mm/abort-ev6.S +++ b/trunk/arch/arm/mm/abort-ev6.S @@ -23,7 +23,8 @@ ENTRY(v6_early_abort) #ifdef CONFIG_CPU_32v6K clrex #else - strex r0, r1, [sp] @ Clear the exclusive monitor + sub r1, sp, #4 @ Get unused stack location + strex r0, r1, [r1] @ Clear the exclusive monitor #endif mrc p15, 0, r1, c5, c0, 0 @ get FSR mrc p15, 0, r0, c6, c0, 0 @ get FAR diff --git a/trunk/arch/arm/plat-s3c64xx/irq-eint.c b/trunk/arch/arm/plat-s3c64xx/irq-eint.c index 1f7cc0067f5c..ebb305ce7689 100644 --- a/trunk/arch/arm/plat-s3c64xx/irq-eint.c +++ b/trunk/arch/arm/plat-s3c64xx/irq-eint.c @@ -55,7 +55,7 @@ static void 
s3c_irq_eint_unmask(unsigned int irq) u32 mask; mask = __raw_readl(S3C64XX_EINT0MASK); - mask |= eint_irq_to_bit(irq); + mask &= ~eint_irq_to_bit(irq); __raw_writel(mask, S3C64XX_EINT0MASK); } diff --git a/trunk/arch/avr32/Kconfig b/trunk/arch/avr32/Kconfig index b189680d18b0..05fe3053dcae 100644 --- a/trunk/arch/avr32/Kconfig +++ b/trunk/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c index fb371f5ce132..d6b772ba3b8f 100644 --- a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c @@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev) { unsigned int val; + /* Do not do the fixup on other platforms! */ + if (!machine_is(gef_sbc610)) + return; + printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ diff --git a/trunk/arch/s390/crypto/aes_s390.c b/trunk/arch/s390/crypto/aes_s390.c index c42cd898f68b..6118890c946d 100644 --- a/trunk/arch/s390/crypto/aes_s390.c +++ b/trunk/arch/s390/crypto/aes_s390.c @@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void) module_init(aes_s390_init); module_exit(aes_s390_fini); -MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 469f3450bf81..f5cef3fbf9a5 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config HAVE_DYNAMIC_PER_CPU_AREA + def_bool y + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP @@ -1125,7 +1128,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA diff --git a/trunk/arch/x86/include/asm/cacheflush.h b/trunk/arch/x86/include/asm/cacheflush.h index 2f8466540fb5..5b301b7ff5f4 100644 --- a/trunk/arch/x86/include/asm/cacheflush.h +++ b/trunk/arch/x86/include/asm/cacheflush.h @@ -5,24 +5,43 @@ #include /* Caches aren't brain-dead on the intel. 
*/ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +static inline void flush_cache_all(void) { } +static inline void flush_cache_mm(struct mm_struct *mm) { } +static inline void flush_cache_dup_mm(struct mm_struct *mm) { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } +static inline void flush_cache_page(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { } +static inline void flush_dcache_page(struct page *page) { } +static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } +static inline void flush_icache_range(unsigned long start, + unsigned long end) { } +static inline void flush_icache_page(struct vm_area_struct *vma, + struct page *page) { } +static inline void flush_icache_user_range(struct vm_area_struct *vma, + struct page *page, + unsigned long addr, + unsigned long len) { } +static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } +static inline void flush_cache_vunmap(unsigned long start, + unsigned long end) { } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) +static inline void copy_to_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} + +static inline void copy_from_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) diff --git a/trunk/arch/x86/include/asm/efi.h b/trunk/arch/x86/include/asm/efi.h index ca5ffb2856b6..edc90f23e708 100644 --- a/trunk/arch/x86/include/asm/efi.h +++ b/trunk/arch/x86/include/asm/efi.h @@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ -#define MAX_EFI_IO_PAGES 100 - extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/trunk/arch/x86/include/asm/fixmap.h b/trunk/arch/x86/include/asm/fixmap.h index dca8f03da5b2..63a79c77d220 100644 --- a/trunk/arch/x86/include/asm/fixmap.h +++ b/trunk/arch/x86/include/asm/fixmap.h @@ -24,9 +24,6 @@ #include #else #include -#ifdef CONFIG_EFI -#include -#endif #endif /* @@ -92,13 +89,6 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_64 -#ifdef CONFIG_EFI - FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE - + MAX_EFI_IO_PAGES - 1, -#endif -#endif #ifdef CONFIG_X86_VISWS_APIC FIX_CO_CPU, /* Cobalt 
timer */ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ diff --git a/trunk/arch/x86/include/asm/i387.h b/trunk/arch/x86/include/asm/i387.h index 48f0004db8c9..71c9e5183982 100644 --- a/trunk/arch/x86/include/asm/i387.h +++ b/trunk/arch/x86/include/asm/i387.h @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -extern void finit(void); +#ifdef CONFIG_MATH_EMULATION +extern void finit_task(struct task_struct *tsk); +#else +static inline void finit_task(struct task_struct *tsk) +{ +} +#endif static inline void tolerant_fwait(void) { diff --git a/trunk/arch/x86/include/asm/io.h b/trunk/arch/x86/include/asm/io.h index 683d0b4c00fc..e5383e3d2f8c 100644 --- a/trunk/arch/x86/include/asm/io.h +++ b/trunk/arch/x86/include/asm/io.h @@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); - #ifdef CONFIG_X86_32 # include "io_32.h" @@ -198,7 +196,6 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff diff --git a/trunk/arch/x86/include/asm/mmzone_32.h b/trunk/arch/x86/include/asm/mmzone_32.h index 105fb90a0635..ede6998bd92c 100644 --- a/trunk/arch/x86/include/asm/mmzone_32.h +++ b/trunk/arch/x86/include/asm/mmzone_32.h @@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. 
- */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ -}) +/* always use node 0 for bootmem on this numa platform */ +#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ + (NODE_DATA(0)->bdata) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/trunk/arch/x86/include/asm/percpu.h b/trunk/arch/x86/include/asm/percpu.h index aee103b26d01..8f1d2fbec1d4 100644 --- a/trunk/arch/x86/include/asm/percpu.h +++ b/trunk/arch/x86/include/asm/percpu.h @@ -43,6 +43,14 @@ #else /* ...!ASSEMBLY */ #include +#include + +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index 1c097a3a6669..d0812e155f1d 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags, return 1; } +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 diff --git a/trunk/arch/x86/include/asm/processor.h b/trunk/arch/x86/include/asm/processor.h index dccef5be0fc1..76139506c3e4 100644 --- a/trunk/arch/x86/include/asm/processor.h +++ b/trunk/arch/x86/include/asm/processor.h @@ -391,9 +391,6 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union); DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); -DECLARE_PER_CPU(unsigned int, irq_count); -extern unsigned long kernel_eflags; -extern asmlinkage void ignore_sysret(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, stack_canary); diff --git a/trunk/arch/x86/include/asm/xen/page.h b/trunk/arch/x86/include/asm/xen/page.h index 4bd990ee43df..1a918dde46b5 100644 --- a/trunk/arch/x86/include/asm/xen/page.h +++ 
b/trunk/arch/x86/include/asm/xen/page.h @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); +unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/trunk/arch/x86/kernel/cpu/common.c b/trunk/arch/x86/kernel/cpu/common.c index a9e3791ca098..826d5c876278 100644 --- a/trunk/arch/x86/kernel/cpu/common.c +++ b/trunk/arch/x86/kernel/cpu/common.c @@ -1,52 +1,52 @@ +#include +#include +#include +#include #include -#include #include -#include #include -#include -#include -#include -#include -#include #include +#include +#include #include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include #include -#include -#include -#include #include -#include #include +#include +#include #include +#include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "cpu.h" #ifdef CONFIG_X86_64 /* all of these masks are initialized in setup_cpu_local_masks() */ -cpumask_var_t cpu_initialized_mask; -cpumask_var_t cpu_callout_mask; cpumask_var_t cpu_callin_mask; +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_initialized_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; @@ -62,10 +62,10 @@ void __init setup_cpu_local_masks(void) #else /* CONFIG_X86_32 */ -cpumask_t cpu_sibling_setup_map; +cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; cpumask_t cpu_initialized; -cpumask_t cpu_callin_map; +cpumask_t cpu_sibling_setup_map; #endif /* CONFIG_X86_32 */ @@ -79,48 +79,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout * - * TLS descriptors are currently at a different place compared to i386. + * The TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) 
*/ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif } }; @@ -164,17 +164,16 @@ static inline int flag_is_changeable_p(u32 flag) * the CPUID. Add "volatile" to not allow gcc to * optimize the subsequent calls to this function. 
*/ - asm volatile ("pushfl \n\t" - "pushfl \n\t" - "popl %0 \n\t" - "movl %0, %1 \n\t" - "xorl %2, %0 \n\t" - "pushl %0 \n\t" - "popfl \n\t" - "pushfl \n\t" - "popl %0 \n\t" - "popfl \n\t" - + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" : "=&r" (f1), "=&r" (f2) : "ir" (flag)); @@ -189,22 +188,18 @@ static int __cpuinit have_cpuid_p(void) static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { - unsigned long lo, hi; - - if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) - return; - - /* Disable processor serial number: */ - - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } } static int __init x86_serial_nr_setup(char *s) @@ -237,7 +232,6 @@ struct cpuid_dependent_feature { u32 feature; u32 level; }; - static const struct cpuid_dependent_feature __cpuinitconst cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, @@ -249,11 +243,7 @@ cpuid_dependent_features[] = { static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) { const struct cpuid_dependent_feature *df; - for (df = cpuid_dependent_features; df->feature; df++) { - - if (!cpu_has(c, df->feature)) - continue; /* * Note: cpuid_level is set to -1 if unavailable, but * extended_extended_level is set to 0 if unavailable @@ -261,26 +251,26 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) * when signed; hence the weird messing around with * signs here... */ - if (!((s32)df->level < 0 ? + if (cpu_has(c, df->feature) && + ((s32)df->level < 0 ? (u32)df->level > (u32)c->extended_cpuid_level : - (s32)df->level > (s32)c->cpuid_level)) - continue; - - clear_cpu_cap(c, df->feature); - if (!warn) - continue; - - printk(KERN_WARNING - "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", - x86_cap_flags[df->feature], df->level); + (s32)df->level > (s32)c->cpuid_level)) { + clear_cpu_cap(c, df->feature); + if (warn) + printk(KERN_WARNING + "CPU: CPU feature %s disabled " + "due to lack of CPUID level 0x%x\n", + x86_cap_flags[df->feature], + df->level); + } } } /* * Naming convention should be: [()] * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this - * isn't used + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * */ /* Look up CPU names by table lookup. */ @@ -317,10 +307,8 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. - */ +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. 
*/ void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; @@ -366,24 +354,22 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) if (c->extended_cpuid_level < 0x80000004) return; - v = (unsigned int *)c->x86_model_id; + v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; - /* - * Intel chips right-justify this string for some dumb reason; - * undo that brain damage: - */ + /* Intel chips right-justify this string for some dumb reason; + undo that brain damage */ p = q = &c->x86_model_id[0]; while (*p == ' ') - p++; + p++; if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ } } @@ -452,30 +438,27 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - goto out; - } + } else if (smp_num_siblings > 1) { - if (smp_num_siblings <= 1) - goto out; - - if (smp_num_siblings > nr_cpu_ids) { - pr_warning("CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } + if (smp_num_siblings > nr_cpu_ids) { + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); - smp_num_siblings = smp_num_siblings / c->x86_max_cores; + smp_num_siblings = smp_num_siblings / c->x86_max_cores; - index_msb = get_count_order(smp_num_siblings); + index_msb = get_count_order(smp_num_siblings); - core_bits = get_count_order(c->x86_max_cores); + core_bits = get_count_order(c->x86_max_cores); - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); + } out: if ((c->x86_max_cores * smp_num_siblings) > 1) { @@ -490,8 +473,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; + static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { if (!cpu_devs[i]) @@ -500,7 +483,6 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!strcmp(v, cpu_devs[i]->c_ident[0]) || (cpu_devs[i]->c_ident[1] && !strcmp(v, cpu_devs[i]->c_ident[1]))) { - this_cpu = cpu_devs[i]; c->x86_vendor = this_cpu->c_x86_vendor; return; @@ -509,9 +491,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!printed) { printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - + printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } @@ -531,17 +511,14 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; - cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) 
& 0xf) << 4; - if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; c->x86_cache_alignment = c->x86_clflush_size; @@ -557,7 +534,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; @@ -566,7 +542,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { if (xlvl >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); @@ -662,7 +637,7 @@ void __init early_cpu_init(void) struct cpu_dev **cdev; int count = 0; - printk(KERN_INFO "KERNEL supported cpus:\n"); + printk("KERNEL supported cpus:\n"); for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { struct cpu_dev *cpudev = *cdev; unsigned int j; @@ -675,7 +650,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + printk(" %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -784,8 +759,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) squash_the_stupid_serial_number(c); /* - * The vendor-specific functions might have changed features. - * Now we do "generic changes." + * The vendor-specific functions might have changed features. Now + * we do "generic changes." */ /* Filter out anything that depends on CPUID levels we don't have */ @@ -868,8 +843,8 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) } struct msr_range { - unsigned min; - unsigned max; + unsigned min; + unsigned max; }; static struct msr_range msr_range_array[] __cpuinitdata = { @@ -881,15 +856,14 @@ static struct msr_range msr_range_array[] __cpuinitdata = { static void __cpuinit print_cpu_msr(void) { - unsigned index_min, index_max; unsigned index; u64 val; int i; + unsigned index_min, index_max; for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { index_min = msr_range_array[i].min; index_max = msr_range_array[i].max; - for (index = index_min; index < index_max; index++) { if (rdmsrl_amd_safe(index, &val)) continue; @@ -899,7 +873,6 @@ static void __cpuinit print_cpu_msr(void) } static int show_msr __cpuinitdata; - static __init int setup_show_msr(char *arg) { int num; @@ -923,12 +896,10 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; - if (c->x86_vendor < X86_VENDOR_NUM) { + if (c->x86_vendor < X86_VENDOR_NUM) vendor = this_cpu->c_vendor; - } else { - if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; - } + else if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; if (vendor && !strstr(c->x86_model_id, vendor)) printk(KERN_CONT "%s ", vendor); @@ -955,12 +926,10 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static __init int setup_disablecpuid(char *arg) { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) setup_clear_cpu_cap(bit); else return 0; - return 1; } __setup("clearcpuid=", setup_disablecpuid); @@ -970,7 +939,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); - DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; @@ -980,21 +948,12 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(unsigned int, irq_count) = -1; 
-/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) __aligned(PAGE_SIZE); +extern asmlinkage void ignore_sysret(void); + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1024,7 +983,7 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); -#else /* CONFIG_X86_64 */ +#else /* x86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, stack_canary); @@ -1036,26 +995,9 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; regs->gs = __KERNEL_STACK_CANARY; - return regs; } -#endif /* CONFIG_X86_64 */ - -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) -{ - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } -} +#endif /* x86_64 */ /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -1065,20 +1007,15 @@ static void clear_all_debug_regs(void) * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 - void __cpuinit cpu_init(void) { - struct orig_ist *orig_ist; - struct task_struct *me; - struct tss_struct *t; + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); unsigned long v; - int cpu; + struct task_struct *me; int i; - cpu = stack_smp_processor_id(); - t = &per_cpu(init_tss, cpu); - orig_ist = &per_cpu(orig_ist, cpu); - #ifdef CONFIG_NUMA if (cpu != 0 && percpu_read(node_number) == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) @@ -1119,17 +1056,19 @@ void __cpuinit cpu_init(void) * set up and load the per-CPU TSS */ if (!orig_ist->ist[0]) { + static const unsigned int sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ + }; char *estacks = per_cpu(exception_stacks, cpu); - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; + estacks += sizes[v]; orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. 
@@ -1159,7 +1098,17 @@ void __cpuinit cpu_init(void) arch_kgdb_ops.correct_hw_break(); else #endif - clear_all_debug_regs(); + { + /* + * Clear all 6 debug registers: + */ + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); + } fpu_init(); @@ -1180,8 +1129,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) - local_irq_enable(); + for (;;) local_irq_enable(); } printk(KERN_INFO "Initializing CPU#%d\n", cpu); @@ -1211,7 +1159,13 @@ void __cpuinit cpu_init(void) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - clear_all_debug_regs(); + /* Clear all 6 debug registers: */ + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); /* * Force FPU initialization: @@ -1231,4 +1185,6 @@ void __cpuinit cpu_init(void) xsave_init(); } + + #endif diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..22590cf688ae 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/trunk/arch/x86/kernel/efi.c b/trunk/arch/x86/kernel/efi.c index b205272ad394..1736acc4d7aa 100644 --- a/trunk/arch/x86/kernel/efi.c +++ b/trunk/arch/x86/kernel/efi.c @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages; + u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; @@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_low_pfn_mapped) + end_pfn = PFN_UP(end); + if (end_pfn <= max_low_pfn_mapped + || (end_pfn > (1UL << (32 - PAGE_SHIFT)) + && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/trunk/arch/x86/kernel/efi_64.c b/trunk/arch/x86/kernel/efi_64.c index a4ee29127fdf..22c3b7828c50 100644 --- a/trunk/arch/x86/kernel/efi_64.c +++ b/trunk/arch/x86/kernel/efi_64.c @@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - static unsigned pages_mapped __initdata; - unsigned i, pages; - unsigned long offset; + unsigned long last_map_pfn; - pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - if (pages_mapped + pages > MAX_EFI_IO_PAGES) + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; - for (i = 0; i < pages; i++) { - __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - phys_addr, PAGE_KERNEL); - phys_addr += PAGE_SIZE; - pages_mapped++; - } - - return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ - (pages_mapped - pages)) + offset; + return (void __iomem *)__va(phys_addr); } diff --git a/trunk/arch/x86/kernel/i387.c b/trunk/arch/x86/kernel/i387.c index 
b0f61f0dcd0a..f2f8540a7f3d 100644 --- a/trunk/arch/x86/kernel/i387.c +++ b/trunk/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit(); + finit_task(tsk); set_stopped_child_used_math(tsk); return 0; } diff --git a/trunk/arch/x86/kernel/irq_32.c b/trunk/arch/x86/kernel/irq_32.c index 9dc6b2b24275..3b09634a5153 100644 --- a/trunk/arch/x86/kernel/irq_32.c +++ b/trunk/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (hardirq_ctx[cpu]) + if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(hardirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - hardirq_ctx[cpu] = irqctx; + per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(softirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - softirq_ctx[cpu] = irqctx; + per_cpu(softirq_ctx, cpu) = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); + cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } void irq_ctx_exit(int cpu) { - hardirq_ctx[cpu] = NULL; + per_cpu(hardirq_ctx, cpu) = NULL; } asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = softirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/trunk/arch/x86/kernel/process.c b/trunk/arch/x86/kernel/process.c index 156f87582c6c..6afa5232dbb7 100644 --- a/trunk/arch/x86/kernel/process.c +++ b/trunk/arch/x86/kernel/process.c @@ -65,11 +65,11 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; - unsigned long *bp = t->io_bitmap_ptr; - if (bp) { + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; 
clear_thread_flag(TIF_IO_BITMAP); /* @@ -78,7 +78,6 @@ void exit_thread(void) memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); - kfree(bp); } ds_exit_thread(current); diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c index 1cc18d439bbb..2aef36d8aca2 100644 --- a/trunk/arch/x86/kernel/reboot.c +++ b/trunk/arch/x86/kernel/reboot.c @@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, { } }; diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index 4c54bc0d8ff3..b746deb9ebc6 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -770,6 +770,9 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -789,8 +792,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { diff --git a/trunk/arch/x86/kernel/setup_percpu.c b/trunk/arch/x86/kernel/setup_percpu.c index d992e6cff730..c29f301d3885 100644 --- a/trunk/arch/x86/kernel/setup_percpu.c +++ b/trunk/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. 
+ */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. 
+ */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + pcpur_size - static_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using + * bootmem allocator and used as-is without being mapped into vmalloc + * area. This enables the first chunk to piggy back on the linear + * physical PMD mapping and doesn't add any additional pressure to + * TLB. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + + ((size_t)pageno << PAGE_SHIFT)); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) + memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + pcpue_unit_size, + pcpue_unit_size - static_size, pcpue_ptr, + NULL); +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. 
+ */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; + unsigned long delta; + size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. 
+ */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } -#endif + pcpu_unit_size = ret; - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); @@ -125,8 +438,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ diff --git a/trunk/arch/x86/kernel/visws_quirks.c b/trunk/arch/x86/kernel/visws_quirks.c index 31ffc24eec4d..191a876e9e87 100644 --- a/trunk/arch/x86/kernel/visws_quirks.c +++ b/trunk/arch/x86/kernel/visws_quirks.c @@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = { static irqreturn_t piix4_master_intr(int irq, void *dev_id) { int realirq; - struct irq_desc *desc; + irq_desc_t *desc; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); diff --git a/trunk/arch/x86/math-emu/fpu_aux.c b/trunk/arch/x86/math-emu/fpu_aux.c index 491e737ce547..aa0987088774 100644 --- a/trunk/arch/x86/math-emu/fpu_aux.c +++ b/trunk/arch/x86/math-emu/fpu_aux.c @@ -30,20 +30,29 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit(void) +void finit_task(struct task_struct *tsk) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; + struct i387_soft_struct *soft = &tsk->thread.xstate->soft; + struct address *oaddr, *iaddr; + soft->cwd = 0x037f; + soft->swd = 0; + soft->ftop = 0; /* We don't keep top in the status word internally. 
*/ + soft->twd = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + oaddr = (struct address *)&soft->foo; + oaddr->offset = 0; + oaddr->selector = 0; + iaddr = (struct address *)&soft->fip; + iaddr->offset = 0; + iaddr->selector = 0; + iaddr->opcode = 0; + soft->no_update = 1; +} + +void finit(void) +{ + finit_task(current); } /* diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c index 0b087dcd2c18..47df0e1bbeb9 100644 --- a/trunk/arch/x86/mm/init_32.c +++ b/trunk/arch/x86/mm/init_32.c @@ -135,6 +135,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/trunk/arch/x86/mm/init_64.c b/trunk/arch/x86/mm/init_64.c index 724e537432e7..07f44d491df1 100644 --- a/trunk/arch/x86/mm/init_64.c +++ b/trunk/arch/x86/mm/init_64.c @@ -168,34 +168,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! 
%p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + /* * Create large page table mappings for a range of physical addresses. */ diff --git a/trunk/arch/x86/mm/kmmio.c b/trunk/arch/x86/mm/kmmio.c index 93d82038af4b..9f205030d9aa 100644 --- a/trunk/arch/x86/mm/kmmio.c +++ b/trunk/arch/x86/mm/kmmio.c @@ -32,11 +32,14 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ + bool old_presence; /* page presence prior to arming */ + bool armed; /* * Number of times this page has been registered as a part * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU). + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. 
*/ int count; }; @@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_page_present(unsigned long addr, bool present, - unsigned int *pglevel) +static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +{ + pmdval_t v = pmd_val(*pmd); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(v)); +} + +static void set_pte_presence(pte_t *pte, bool present, bool *old) +{ + pteval_t v = pte_val(*pte); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(v)); +} + +static int set_page_presence(unsigned long addr, bool present, bool *old) { - pteval_t pteval; - pmdval_t pmdval; unsigned int level; - pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); if (!pte) { pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return; + return -1; } - if (pglevel) - *pglevel = level; - switch (level) { case PG_LEVEL_2M: - pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; - if (present) - pmdval |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(pmdval)); + set_pmd_presence((pmd_t *)pte, present, old); break; - case PG_LEVEL_4K: - pteval = pte_val(*pte) & ~_PAGE_PRESENT; - if (present) - pteval |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(pteval)); + set_pte_presence(pte, present, old); break; - default: pr_err("kmmio: unexpected page level 0x%x.\n", level); - return; + return -1; } __flush_tlb_one(addr); + return 0; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. + * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { - set_page_present(page & PAGE_MASK, false, pglevel); + int ret; + WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + if (f->armed) { + pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, f->old_presence); + } + ret = set_page_presence(f->page, false, &f->old_presence); + WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + f->armed = true; + return ret; } -/** Mark the given page as present. */ -static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - set_page_present(page & PAGE_MASK, true, pglevel); + bool tmp; + int ret = set_page_presence(f->page, f->old_presence, &tmp); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + f->armed = false; } /* @@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - disarm_kmmio_fault_page(faultpage->page, NULL); if (addr == ctx->addr) { /* - * On SMP we sometimes get recursive probe hits on the - * same address. Context is already saved, fall out. 
+ * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. */ - pr_debug("kmmio: duplicate probe hit on CPU %d, for " - "address 0x%08lx.\n", - smp_processor_id(), addr); - ret = 1; - goto no_kmmio_ctx; - } - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at least - * prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " + pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("kmmio: unexpected secondary hit for " + "address 0x%08lx on CPU %d.\n", addr, + smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + disarm_kmmio_fault_page(faultpage); + } goto no_kmmio_ctx; } ctx->active++; @@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage->page, NULL); + disarm_kmmio_fault_page(ctx->fpage); /* * If another cpu accesses the same page while we are stepping, @@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + pr_warning("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + /* Prevent racing against release_kmmio_fault_page(). 
*/ + spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + spin_unlock(&kmmio_lock); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page) f = get_kmmio_fault_page(page); if (f) { if (!f->count) - arm_kmmio_fault_page(f->page, NULL); + arm_kmmio_fault_page(f); f->count++; return 0; } - f = kmalloc(sizeof(*f), GFP_ATOMIC); + f = kzalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -1; f->count = 1; f->page = page; - list_add_rcu(&f->list, kmmio_page_list(f->page)); - arm_kmmio_fault_page(f->page, NULL); + if (arm_kmmio_fault_page(f)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->page)); return 0; } @@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page, f->count--; BUG_ON(f->count < 0); if (!f->count) { - disarm_kmmio_fault_page(f->page, NULL); + disarm_kmmio_fault_page(f); f->release_next = *release_list; *release_list = f; } diff --git a/trunk/arch/x86/mm/testmmiotrace.c b/trunk/arch/x86/mm/testmmiotrace.c index ab50a8d7402c..427fd1b56df5 100644 --- a/trunk/arch/x86/mm/testmmiotrace.c +++ b/trunk/arch/x86/mm/testmmiotrace.c @@ -1,5 +1,5 @@ /* - * Written by Pekka Paalanen, 2008 + * Written by Pekka Paalanen, 2008-2009 */ #include #include @@ -9,35 +9,74 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); + +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} static void do_write_test(void __iomem *p) { unsigned int i; + pr_info(MODULE_NAME ": write test.\n"); mmiotrace_printk("Write test.\n"); + for (i = 0; i < 256; i++) iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(i * 12 + 7, p + i); + iowrite16(v16(i), p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(i * 212371 + 13, p + i); + iowrite32(v32(i), p + i); } static void do_read_test(void __iomem *p) { unsigned int i; + unsigned errs[3] = { 0 }; + pr_info(MODULE_NAME ": read test.\n"); mmiotrace_printk("Read test.\n"); + for (i = 0; i < 256; i++) - ioread8(p + i); + if (ioread8(p + i) != i) + ++errs[0]; + for (i = 1024; i < (5 * 1024); i += 2) - ioread16(p + i); + if (ioread16(p + i) != v16(i)) + ++errs[1]; + for (i = (5 * 1024); i < (16 * 1024); i += 4) - ioread32(p + i); + if (ioread32(p + i) != v32(i)) + ++errs[2]; + + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); } -static void do_test(void) +static void do_read_far_test(void __iomem *p) { - void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + pr_info(MODULE_NAME ": read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; @@ -45,11 +84,15 @@ static void do_test(void) mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); iounmap(p); } static int __init init(void) { + unsigned 
long size = (read_far) ? (8 << 20) : (16 << 10); + if (mmio_address == 0) { pr_err(MODULE_NAME ": you have to use the module argument " "mmio_address.\n"); @@ -58,10 +101,11 @@ static int __init init(void) return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " - "in PCI address space, and writing " - "rubbish in there.\n", mmio_address); - do_test(); + pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " + "address space, and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + pr_info(MODULE_NAME ": All done.\n"); return 0; } diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c index c52f4034c7fd..82cd39a6cbd3 100644 --- a/trunk/arch/x86/xen/enlighten.c +++ b/trunk/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = virt_to_mfn(vcpup); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + frames[f] = arbitrary_virt_to_mfn((void *)va); + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c index 319bd40a57c2..cb6afa4ec95c 100644 --- a/trunk/arch/x86/xen/mmu.c +++ b/trunk/arch/x86/xen/mmu.c @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } +unsigned long arbitrary_virt_to_mfn(void *vaddr) +{ + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/trunk/arch/x86/xen/smp.c b/trunk/arch/x86/xen/smp.c index 035582ae815d..8d470562ffc9 100644 --- a/trunk/arch/x86/xen/smp.c +++ b/trunk/arch/x86/xen/smp.c @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; + unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; diff --git 
a/trunk/block/blktrace.c b/trunk/block/blktrace.c index 7cf9d1ff45a0..028120a0965a 100644 --- a/trunk/block/blktrace.c +++ b/trunk/block/blktrace.c @@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; diff --git a/trunk/crypto/api.c b/trunk/crypto/api.c index efe77df6863f..38a2bc02a98c 100644 --- a/trunk/crypto/api.c +++ b/trunk/crypto/api.c @@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); type &= mask; - alg = try_then_request_module(crypto_alg_lookup(name, type, mask), - name); + alg = crypto_alg_lookup(name, type, mask); + if (!alg) { + char tmp[CRYPTO_MAX_ALG_NAME]; + + request_module(name); + + if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) && + snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) + request_module(tmp); + + alg = crypto_alg_lookup(name, type, mask); + } + if (alg) return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; diff --git a/trunk/drivers/acpi/processor_perflib.c b/trunk/drivers/acpi/processor_perflib.c index 9cc769b587ff..68fd3d292799 100644 --- a/trunk/drivers/acpi/processor_perflib.c +++ b/trunk/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !percpu_ptr(performance, i)) { + if (!performance || !per_cpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = percpu_ptr(performance, i); + pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/trunk/drivers/crypto/ixp4xx_crypto.c b/trunk/drivers/crypto/ixp4xx_crypto.c index 2d637e0fbc03..d9e751be8c5f 100644 --- a/trunk/drivers/crypto/ixp4xx_crypto.c +++ b/trunk/drivers/crypto/ixp4xx_crypto.c @@ -457,10 +457,12 @@ static int init_ixp_crypto(void) if (!ctx_pool) { goto err; } - ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); + ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0, + "ixp_crypto:out", NULL); if (ret) goto err; - ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); + ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0, + "ixp_crypto:in", NULL); if (ret) { qmgr_release_queue(SEND_QID); goto err; diff --git a/trunk/drivers/crypto/padlock-aes.c b/trunk/drivers/crypto/padlock-aes.c index 856b3cc25583..3f0fdd18255d 100644 --- a/trunk/drivers/crypto/padlock-aes.c +++ b/trunk/drivers/crypto/padlock-aes.c @@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-all"); diff --git a/trunk/drivers/crypto/padlock-sha.c b/trunk/drivers/crypto/padlock-sha.c index a7fbadebf623..a2c8e8514b63 100644 --- a/trunk/drivers/crypto/padlock-sha.c +++ b/trunk/drivers/crypto/padlock-sha.c @@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support."); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("sha1"); -MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha1-all"); +MODULE_ALIAS("sha256-all"); MODULE_ALIAS("sha1-padlock"); MODULE_ALIAS("sha256-padlock"); diff --git a/trunk/drivers/dma/iop-adma.c b/trunk/drivers/dma/iop-adma.c index ea5440dd10dc..647374acba94 100644 --- a/trunk/drivers/dma/iop-adma.c +++ b/trunk/drivers/dma/iop-adma.c @@ 
-1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma"); static struct platform_driver iop_adma_driver = { .probe = iop_adma_probe, - .remove = iop_adma_remove, + .remove = __devexit_p(iop_adma_remove), .driver = { .owner = THIS_MODULE, .name = "iop-adma", diff --git a/trunk/drivers/dma/mv_xor.c b/trunk/drivers/dma/mv_xor.c index d35cbd1ff0b3..5d5d5b31867f 100644 --- a/trunk/drivers/dma/mv_xor.c +++ b/trunk/drivers/dma/mv_xor.c @@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp, static struct platform_driver mv_xor_driver = { .probe = mv_xor_probe, - .remove = mv_xor_remove, + .remove = __devexit_p(mv_xor_remove), .driver = { .owner = THIS_MODULE, .name = MV_XOR_NAME, diff --git a/trunk/drivers/gpu/drm/drm_stub.c b/trunk/drivers/gpu/drm/drm_stub.c index 096e2a37446d..7c8b15b22bf2 100644 --- a/trunk/drivers/gpu/drm/drm_stub.c +++ b/trunk/drivers/gpu/drm/drm_stub.c @@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, file_priv->minor->master != file_priv->master) { mutex_lock(&dev->struct_mutex); file_priv->minor->master = drm_master_get(file_priv->master); - mutex_lock(&dev->struct_mutex); + mutex_unlock(&dev->struct_mutex); } return 0; diff --git a/trunk/drivers/i2c/busses/i2c-mv64xxx.c b/trunk/drivers/i2c/busses/i2c-mv64xxx.c index eeda276f8f16..7f186bbcb99d 100644 --- a/trunk/drivers/i2c/busses/i2c-mv64xxx.c +++ b/trunk/drivers/i2c/busses/i2c-mv64xxx.c @@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd, return 0; } -static void __devexit +static void mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) { if (drv_data->reg_base) { @@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) static struct platform_driver mv64xxx_i2c_driver = { .probe = mv64xxx_i2c_probe, - .remove = mv64xxx_i2c_remove, + .remove = __devexit_p(mv64xxx_i2c_remove), .driver = { .owner = THIS_MODULE, .name = MV64XXX_I2C_CTLR_NAME, diff --git a/trunk/drivers/mtd/nand/orion_nand.c b/trunk/drivers/mtd/nand/orion_nand.c index 917cf8d3ae95..c2dfd3ea353d 100644 --- a/trunk/drivers/mtd/nand/orion_nand.c +++ b/trunk/drivers/mtd/nand/orion_nand.c @@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev) static struct platform_driver orion_nand_driver = { .probe = orion_nand_probe, - .remove = orion_nand_remove, + .remove = __devexit_p(orion_nand_remove), .driver = { .name = "orion_nand", .owner = THIS_MODULE, diff --git a/trunk/drivers/net/arm/Makefile b/trunk/drivers/net/arm/Makefile index c69c0cdba4a2..811a3ccd14c1 100644 --- a/trunk/drivers/net/arm/Makefile +++ b/trunk/drivers/net/arm/Makefile @@ -4,7 +4,7 @@ # obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o -obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o +obj-$(CONFIG_ARM_ETHERH) += etherh.o obj-$(CONFIG_ARM_ETHER3) += ether3.o obj-$(CONFIG_ARM_ETHER1) += ether1.o obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o diff --git a/trunk/drivers/net/arm/etherh.c b/trunk/drivers/net/arm/etherh.c index 54b52e5b1821..f52f668c49bf 100644 --- a/trunk/drivers/net/arm/etherh.c +++ b/trunk/drivers/net/arm/etherh.c @@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = { .ndo_open = etherh_open, .ndo_stop = etherh_close, .ndo_set_config = etherh_set_config, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = 
__ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; diff --git a/trunk/drivers/video/pxafb.c b/trunk/drivers/video/pxafb.c index 48ff701d3a72..2552b9f325ee 100644 --- a/trunk/drivers/video/pxafb.c +++ b/trunk/drivers/video/pxafb.c @@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev) static struct platform_driver pxafb_driver = { .probe = pxafb_probe, - .remove = pxafb_remove, + .remove = __devexit_p(pxafb_remove), .suspend = pxafb_suspend, .resume = pxafb_resume, .driver = { diff --git a/trunk/include/linux/bootmem.h b/trunk/include/linux/bootmem.h index 95837bfb5256..455d83219fae 100644 --- a/trunk/include/linux/bootmem.h +++ b/trunk/include/linux/bootmem.h @@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size); #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) +extern int reserve_bootmem(unsigned long addr, + unsigned long size, + int flags); extern int reserve_bootmem_node(pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size, - int flags); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); -#endif + unsigned long physaddr, + unsigned long size, + int flags); -extern void *__alloc_bootmem_nopanic(unsigned long size, +extern void *__alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem(unsigned long size, +extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE + #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_nopanic(x) \ __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + +#define alloc_bootmem_low(x) \ + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, 
unsigned long size, int flags); diff --git a/trunk/include/linux/percpu.h b/trunk/include/linux/percpu.h index 3577ffd90d45..545b068bcb70 100644 --- a/trunk/include/linux/percpu.h +++ b/trunk/include/linux/percpu.h @@ -76,52 +76,98 @@ #ifdef CONFIG_SMP +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +/* minimum unit size, also is the maximum supported allocation size */ +#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) + +/* + * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy + * back on the first chunk if arch is manually allocating and mapping + * it for faster access (as a part of large page mapping for example). + * Note that dynamic percpu allocator covers both static and dynamic + * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * + * On typical configuration with modules, the following values leave + * about 8k of free space on the first chunk after boot on both x86_32 + * and 64 when module support is enabled. When module support is + * disabled, it's much tighter. + */ +#ifndef PERCPU_DYNAMIC_RESERVE +# if BITS_PER_LONG > 32 +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# endif +# else +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# endif +# endif +#endif /* PERCPU_DYNAMIC_RESERVE */ + +extern void *pcpu_base_addr; + +typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); +typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); + +extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); + +/* + * Use this to get to a cpu's version of the per-cpu object + * dynamically allocated. Non-atomic access to the current CPU's + * version should probably be combined with get_cpu()/put_cpu(). + */ +#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) + +#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + struct percpu_data { void *ptrs[1]; }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define percpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ + +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); -extern void percpu_free(void *__pdata); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +extern void *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void *__pdata); #else /* CONFIG_SMP */ -#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +static inline void *__alloc_percpu(size_t size, size_t align) { - return kzalloc(size, gfp); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. 
+ */ + WARN_ON_ONCE(align > SMP_CACHE_BYTES); + return kzalloc(size, GFP_KERNEL); } -static inline void percpu_free(void *__pdata) +static inline void free_percpu(void *p) { - kfree(__pdata); + kfree(p); } #endif /* CONFIG_SMP */ -#define percpu_alloc_mask(size, gfp, mask) \ - __percpu_alloc_mask((size), (gfp), &(mask)) - -#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) - -/* (legacy) interface for use without CPU hotplug handling */ - -#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ - cpu_possible_map) -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) -#define free_percpu(ptr) percpu_free((ptr)) -#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #endif /* __LINUX_PERCPU_H */ diff --git a/trunk/include/linux/rcuclassic.h b/trunk/include/linux/rcuclassic.h index f3f697df1d71..80044a4f3ab9 100644 --- a/trunk/include/linux/rcuclassic.h +++ b/trunk/include/linux/rcuclassic.h @@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) +/* A context switch is a grace period for rcuclassic. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/trunk/include/linux/rcupdate.h b/trunk/include/linux/rcupdate.h index 921340a7b71c..528343e6da51 100644 --- a/trunk/include/linux/rcupdate.h +++ b/trunk/include/linux/rcupdate.h @@ -52,6 +52,9 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +/* Internal to kernel, but needed by rcupreempt.h. */ +extern int rcu_scheduler_active; + #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/trunk/include/linux/rcupreempt.h b/trunk/include/linux/rcupreempt.h index 3e05c09b54a2..74304b4538d8 100644 --- a/trunk/include/linux/rcupreempt.h +++ b/trunk/include/linux/rcupreempt.h @@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ +/* + * A context switch is a grace period for rcupreempt synchronize_rcu() + * only during early boot, before the scheduler has been initialized. + * So, how the heck do we get a context switch? Well, if the caller + * invokes synchronize_rcu(), they are willing to accept a context + * switch, so we simply pretend that one happened. + * + * After boot, there might be a blocked or preempted task in an RCU + * read-side critical section, so we cannot then take the fastpath. + */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1 && !rcu_scheduler_active; +} + #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/trunk/include/linux/rcutree.h b/trunk/include/linux/rcutree.h index d4368b7975c3..a722fb67bb2d 100644 --- a/trunk/include/linux/rcutree.h +++ b/trunk/include/linux/rcutree.h @@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ +/* A context switch is a grace period for rcutree. 
*/ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUTREE_H */ diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index f0a50b20e8a0..a7c7698583bb 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -2303,9 +2303,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/trunk/include/linux/vmalloc.h b/trunk/include/linux/vmalloc.h index 9c0890c7a06a..a43ebec3a7b9 100644 --- a/trunk/include/linux/vmalloc.h +++ b/trunk/include/linux/vmalloc.h @@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); +extern int map_kernel_range_noflush(unsigned long start, unsigned long size, + pgprot_t prot, struct page **pages); +extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ @@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; +extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/trunk/init/main.c b/trunk/init/main.c index 6441083f8273..6bf83afd654d 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state; +enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); /* @@ -464,6 +464,7 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); + rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index ba22484a987e..1f0657ae555b 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme) } } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - static int percpu_modinit(void) { pcpu_num_used = 2; @@ -513,7 +527,26 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); + +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + #else /* ... !CONFIG_SMP */ + static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/trunk/kernel/rcuclassic.c b/trunk/kernel/rcuclassic.c index bd5a9003497c..654c640a6b9c 100644 --- a/trunk/kernel/rcuclassic.c +++ b/trunk/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/rcupdate.c b/trunk/kernel/rcupdate.c index d92a76a881aa..cae8a059cf47 100644 --- a/trunk/kernel/rcupdate.c +++ b/trunk/kernel/rcupdate.c @@ -44,6 +44,7 @@ #include #include #include +#include enum rcu_barrier { RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -175,3 +181,9 @@ void __init rcu_init(void) __rcu_init(); } +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} diff --git a/trunk/kernel/rcupreempt.c b/trunk/kernel/rcupreempt.c index 33cfc50781f9..5d59e850fb71 100644 --- a/trunk/kernel/rcupreempt.c +++ b/trunk/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; + if (num_online_cpus() == 1) + return; /* blocking is gp if only one CPU! */ + init_completion(&rcu.completion); /* Will wake me after RCU finished. 
*/ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/trunk/kernel/rcutree.c b/trunk/kernel/rcutree.c index b2fd602a6f6f..97ce31579ec0 100644 --- a/trunk/kernel/rcutree.c +++ b/trunk/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 7d97ff7c4478..0a76d0b6f215 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -9219,6 +9219,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9312,8 +9322,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -9476,7 +9485,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9495,7 +9504,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9591,7 +9600,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/trunk/kernel/stop_machine.c b/trunk/kernel/stop_machine.c index 0cd415ee62a2..74541ca49536 100644 --- a/trunk/kernel/stop_machine.c +++ b/trunk/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. 
*/ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } diff --git a/trunk/kernel/sys.c b/trunk/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/trunk/kernel/sys.c +++ b/trunk/kernel/sys.c @@ -559,7 +559,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). 
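The kernel/sys.c and kernel/user.c hunks above work together: set_user() now consults task_can_switch_user(), which (when CONFIG_RT_GROUP_SCHED and CONFIG_USER_SCHED are both enabled) defers to sched_rt_can_attach() so a realtime task is refused a switch to a user whose task group has zero RT runtime, and the setuid/setreuid/setresuid paths propagate that -EINVAL instead of assuming -EAGAIN. Below is a minimal, stand-alone user-space sketch of that decision only; the struct and function names are made-up stand-ins, not the real task_struct/user_struct definitions.

/*
 * Standalone model (not kernel code) of the setuid gate added above.
 * All types and helpers here are illustrative stand-ins.
 */
#include <stdio.h>
#include <errno.h>

struct model_group { long rt_runtime; };        /* stand-in for task_group */
struct model_user  { struct model_group *tg; }; /* stand-in for user_struct */
struct model_task  { int is_rt; };              /* stand-in for task_struct */

/* mirrors sched_rt_can_attach(): refuse RT tasks when the group has no runtime */
static int model_rt_can_attach(struct model_group *tg, struct model_task *tsk)
{
	if (tsk->is_rt && tg->rt_runtime == 0)
		return 0;
	return 1;
}

/* mirrors the reworked set_user(): report -EINVAL rather than a blanket -EAGAIN */
static int model_set_user(struct model_user *new_user, struct model_task *tsk)
{
	if (!model_rt_can_attach(new_user->tg, tsk))
		return -EINVAL;
	return 0;
}

int main(void)
{
	struct model_group no_rt   = { .rt_runtime = 0 };
	struct model_user  user    = { .tg = &no_rt };
	struct model_task  rt_task = { .is_rt = 1 };
	struct model_task  normal  = { .is_rt = 0 };

	printf("RT task -> user with no RT runtime: %d (expect %d)\n",
	       model_set_user(&user, &rt_task), -EINVAL);
	printf("normal task -> same user:           %d (expect 0)\n",
	       model_set_user(&user, &normal));
	return 0;
}

Propagating set_user()'s return value matters here because the failure is now a policy rejection (-EINVAL) rather than the resource-limit case (-EAGAIN) the callers previously hard-coded.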
diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/trunk/mm/allocpercpu.c b/trunk/mm/allocpercpu.c index 4297bc41bfd2..3653c570232b 100644 --- a/trunk/mm/allocpercpu.c +++ b/trunk/mm/allocpercpu.c @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. */ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. 
*/ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); diff --git a/trunk/mm/bootmem.c b/trunk/mm/bootmem.c index 51a0ccf61e0e..daf92713f7de 100644 --- a/trunk/mm/bootmem.c +++ b/trunk/mm/bootmem.c @@ -382,7 +382,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +402,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -429,8 +427,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, } static void * __init alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -530,17 +528,34 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, return NULL; } +static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ +#ifdef CONFIG_HAVE_ARCH_BOOTMEM + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, goal, limit); +#endif + return NULL; +} + static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; + void *region; restart: - list_for_each_entry(bdata, &bdata_list, list) { - void *region; + region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); + if (region) + return region; + list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -618,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; + ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -674,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; diff --git a/trunk/mm/percpu.c b/trunk/mm/percpu.c new file mode 100644 index 000000000000..3d0f5456827c --- /dev/null +++ b/trunk/mm/percpu.c @@ -0,0 +1,979 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. 
Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be eqaul to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the followings. + * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages __read_mostly; +static int pcpu_unit_size __read_mostly; +static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_slots __read_mostly; +static size_t pcpu_chunk_struct_size __read_mostly; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr __read_mostly; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size __read_mostly; + +/* + * One mutex to rule them all. 
+ * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int __pcpu_size_to_slot(int size) +{ + int highbit = fls(size); /* size is in bytes */ + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_nr_slots - 1; + return __pcpu_size_to_slot(size); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return cpu * pcpu_unit_pages + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size in bytes (can be 0 for new allocations) + * @new_size: the wanted new size in bytes (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. + */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. 
+ */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searchs for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size in bytes (can be 0) + * @tail: tail size in bytes (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? 
*/ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size in bytes + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. + */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. + */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. 
+ * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. + */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* unmap must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. + */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. 
+ */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, + bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate in bytes + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int uninitialized_var(map_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + WARN_ON(chunk->immutable); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. 
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto + * @free_size: free size in bytes, 0 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @unit_size, if non-zero, determines unit size and must be aligned + * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * + * @free_size determines the number of free bytes after the static + * area in the first chunk. If zero, whatever left is available. + * Specifying non-zero value make percpu leave the area after + * @static_size + @free_size alone. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. 
+ */ +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + unsigned int cpu; + int nr_pages; + int err, i; + + /* santiy checks */ + BUG_ON(!static_size); + BUG_ON(!unit_size && free_size); + BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(base_addr && !unit_size); + BUG_ON(base_addr && populate_pte_fn); + + if (unit_size) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(static_size)); + + pcpu_static_size = static_size; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + /* + * Allocate chunk slots. The additional last slot is for + * empty chunks. + */ + pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + + if (free_size) + static_chunk->free_size = free_size; + else + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + + static_chunk->contig_hint = static_chunk->free_size; + + /* allocate vm address */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&static_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the static chunk can't + * be mapped or unmapped by percpu and is marked + * immutable. + */ + static_vm.addr = base_addr; + static_chunk->immutable = true; + } + + /* assign pages */ + nr_pages = -1; + for_each_possible_cpu(cpu) { + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + } + + BUG_ON(i < PFN_UP(pcpu_static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); + } + + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(static_chunk, + cpu, i)); + + err = pcpu_map(static_chunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} diff --git a/trunk/mm/vmalloc.c b/trunk/mm/vmalloc.c index 11a929872ebd..af58324c361a 100644 --- a/trunk/mm/vmalloc.c +++ b/trunk/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -990,6 +1000,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @align: requested alignment + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm, size_t align) +{ + static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + + vm->addr = (void *)addr; + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; @@ -1017,6 +1053,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. 
+ */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; diff --git a/trunk/net/ipv4/af_inet.c b/trunk/net/ipv4/af_inet.c index 743f5542d65a..3a3dad801354 100644 --- a/trunk/net/ipv4/af_inet.c +++ b/trunk/net/ipv4/af_inet.c @@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize); + ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize); + ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[1]) goto err1; return 0; diff --git a/trunk/net/ipv4/route.c b/trunk/net/ipv4/route.c index 97f71153584f..bf895401218f 100644 --- a/trunk/net/ipv4/route.c +++ b/trunk/net/ipv4/route.c @@ -3376,7 +3376,7 @@ int __init ip_rt_init(void) int rc = 0; #ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif
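
A minimal usage sketch (not part of the patch): this is how a caller might use the two-argument __alloc_percpu()/free_percpu() interface introduced above, in the same style as the snmp_mib_init() and ip_rt_init() conversions in the last two hunks. The my_stats structure and the my_stats_* functions are hypothetical names invented for illustration only.

	struct my_stats {
		unsigned long long	packets;
		unsigned long long	bytes;
	};

	static struct my_stats *stats;		/* percpu pointer */

	static int __init my_stats_init(void)
	{
		/*
		 * One instance per possible CPU, aligned like its widest
		 * member; may sleep, so call from process context.
		 */
		stats = __alloc_percpu(sizeof(struct my_stats),
				       __alignof__(struct my_stats));
		if (!stats)
			return -ENOMEM;
		return 0;
	}

	static void my_stats_account(unsigned int len)
	{
		/* per_cpu_ptr() resolves the per-CPU copy, as in snmp_fold_field() */
		struct my_stats *s = per_cpu_ptr(stats, smp_processor_id());

		s->packets++;
		s->bytes += len;
	}

	static void my_stats_exit(void)
	{
		free_percpu(stats);	/* may sleep; returns the area to its chunk */
	}

The alignment argument mirrors the __alignof__(unsigned long long) and __alignof__(struct ip_rt_acct) values passed in the af_inet.c and route.c hunks; alignment larger than PAGE_SIZE is rejected by __alloc_percpu().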