diff --git a/[refs] b/[refs] index 5b68eff60de7..d21761059362 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: caab36b593b44c97e3c7707c6a8054b320f8d622 +refs/heads/master: 540aca06b737cc38965b52eeceefba3d24376461 diff --git a/trunk/Makefile b/trunk/Makefile index c40d83aedebe..27fb890a2bff 100644 --- a/trunk/Makefile +++ b/trunk/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 29 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc6 NAME = Erotic Pickled Herring # *DOCUMENTATION* diff --git a/trunk/arch/alpha/mm/init.c b/trunk/arch/alpha/mm/init.c index af71d38c8e41..5d7a16eab312 100644 --- a/trunk/arch/alpha/mm/init.c +++ b/trunk/arch/alpha/mm/init.c @@ -189,21 +189,9 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long nr_pages = 0; - unsigned long vaddr; + unsigned long vaddr = VMALLOC_START; unsigned long i, j; - /* calculate needed size */ - for (i = 0; i < crb->map_entries; ++i) - nr_pages += crb->map[i].count; - - /* register the vm area */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm, PAGE_SIZE); - - vaddr = (unsigned long)console_remap_vm.addr; - /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -225,6 +213,12 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } + + /* Let vmalloc know that we've allocated some space. */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.addr = (void *) VMALLOC_START; + console_remap_vm.size = vaddr - VMALLOC_START; + vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/trunk/arch/arm/kernel/setup.c b/trunk/arch/arm/kernel/setup.c index 68d6494c0389..7049815d66d5 100644 --- a/trunk/arch/arm/kernel/setup.c +++ b/trunk/arch/arm/kernel/setup.c @@ -233,13 +233,12 @@ static void __init cacheid_init(void) unsigned int cachetype = read_cpuid_cachetype(); unsigned int arch = cpu_architecture(); - if (arch >= CPU_ARCH_ARMv6) { - if ((cachetype & (7 << 29)) == 4 << 29) { - /* ARMv7 register format */ - cacheid = CACHEID_VIPT_NONALIASING; - if ((cachetype & (3 << 14)) == 1 << 14) - cacheid |= CACHEID_ASID_TAGGED; - } else if (cachetype & (1 << 23)) + if (arch >= CPU_ARCH_ARMv7) { + cacheid = CACHEID_VIPT_NONALIASING; + if ((cachetype & (3 << 14)) == 1 << 14) + cacheid |= CACHEID_ASID_TAGGED; + } else if (arch >= CPU_ARCH_ARMv6) { + if (cachetype & (1 << 23)) cacheid = CACHEID_VIPT_ALIASING; else cacheid = CACHEID_VIPT_NONALIASING; diff --git a/trunk/arch/arm/mach-at91/pm.c b/trunk/arch/arm/mach-at91/pm.c index 7ac812dc055a..9bb4f043aa22 100644 --- a/trunk/arch/arm/mach-at91/pm.c +++ b/trunk/arch/arm/mach-at91/pm.c @@ -332,6 +332,7 @@ static int at91_pm_enter(suspend_state_t state) at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); error: + sdram_selfrefresh_disable(); target_state = PM_SUSPEND_ON; at91_irq_resume(); at91_gpio_resume(); diff --git a/trunk/arch/arm/mm/abort-ev6.S b/trunk/arch/arm/mm/abort-ev6.S index 94077fbd96b7..8a7f65ba14b7 100644 --- a/trunk/arch/arm/mm/abort-ev6.S +++ b/trunk/arch/arm/mm/abort-ev6.S @@ -23,8 +23,7 @@ ENTRY(v6_early_abort) #ifdef CONFIG_CPU_32v6K clrex #else - sub r1, sp, #4 @ Get unused stack location - strex r0, r1, [r1] @ Clear the exclusive monitor + strex r0, r1, [sp] @ Clear the exclusive monitor #endif mrc p15, 0, r1, c5, c0, 0 @ get FSR mrc p15, 0, r0, c6, c0, 0 @ get FAR diff --git a/trunk/arch/arm/plat-s3c64xx/irq-eint.c 
b/trunk/arch/arm/plat-s3c64xx/irq-eint.c index ebb305ce7689..1f7cc0067f5c 100644 --- a/trunk/arch/arm/plat-s3c64xx/irq-eint.c +++ b/trunk/arch/arm/plat-s3c64xx/irq-eint.c @@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq) u32 mask; mask = __raw_readl(S3C64XX_EINT0MASK); - mask &= ~eint_irq_to_bit(irq); + mask |= eint_irq_to_bit(irq); __raw_writel(mask, S3C64XX_EINT0MASK); } diff --git a/trunk/arch/avr32/Kconfig b/trunk/arch/avr32/Kconfig index 05fe3053dcae..b189680d18b0 100644 --- a/trunk/arch/avr32/Kconfig +++ b/trunk/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM +config HAVE_ARCH_BOOTMEM_NODE def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c index d6b772ba3b8f..fb371f5ce132 100644 --- a/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/trunk/arch/powerpc/platforms/86xx/gef_sbc610.c @@ -142,10 +142,6 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev) { unsigned int val; - /* Do not do the fixup on other platforms! */ - if (!machine_is(gef_sbc610)) - return; - printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ diff --git a/trunk/arch/s390/crypto/aes_s390.c b/trunk/arch/s390/crypto/aes_s390.c index 6118890c946d..c42cd898f68b 100644 --- a/trunk/arch/s390/crypto/aes_s390.c +++ b/trunk/arch/s390/crypto/aes_s390.c @@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void) module_init(aes_s390_init); module_exit(aes_s390_fini); -MODULE_ALIAS("aes-all"); +MODULE_ALIAS("aes"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 31758378bcd2..469f3450bf81 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -138,9 +138,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP @@ -783,11 +780,6 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. -config X86_MCE_THRESHOLD - depends on X86_MCE_AMD || X86_MCE_INTEL - bool - default y - config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE @@ -1133,7 +1125,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM +config HAVE_ARCH_BOOTMEM_NODE def_bool y depends on X86_32 && NUMA diff --git a/trunk/arch/x86/include/asm/apicdef.h b/trunk/arch/x86/include/asm/apicdef.h index bc9514fb3b13..63134e31e8b9 100644 --- a/trunk/arch/x86/include/asm/apicdef.h +++ b/trunk/arch/x86/include/asm/apicdef.h @@ -53,7 +53,6 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 -#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/trunk/arch/x86/include/asm/cacheflush.h b/trunk/arch/x86/include/asm/cacheflush.h index 5b301b7ff5f4..2f8466540fb5 100644 --- a/trunk/arch/x86/include/asm/cacheflush.h +++ b/trunk/arch/x86/include/asm/cacheflush.h @@ -5,43 +5,24 @@ #include /* Caches aren't brain-dead on the intel. 
*/ -static inline void flush_cache_all(void) { } -static inline void flush_cache_mm(struct mm_struct *mm) { } -static inline void flush_cache_dup_mm(struct mm_struct *mm) { } -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) { } -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, unsigned long pfn) { } -static inline void flush_dcache_page(struct page *page) { } -static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } -static inline void flush_icache_range(unsigned long start, - unsigned long end) { } -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) { } -static inline void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, - unsigned long addr, - unsigned long len) { } -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -static inline void flush_cache_vunmap(unsigned long start, - unsigned long end) { } +#define flush_cache_all() do { } while (0) +#define flush_cache_mm(mm) do { } while (0) +#define flush_cache_dup_mm(mm) do { } while (0) +#define flush_cache_range(vma, start, end) do { } while (0) +#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define flush_dcache_page(page) do { } while (0) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) +#define flush_icache_range(start, end) do { } while (0) +#define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) +#define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vunmap(start, end) do { } while (0) -static inline void copy_to_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} - -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} +#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + memcpy((dst), (src), (len)) #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) diff --git a/trunk/arch/x86/include/asm/efi.h b/trunk/arch/x86/include/asm/efi.h index edc90f23e708..ca5ffb2856b6 100644 --- a/trunk/arch/x86/include/asm/efi.h +++ b/trunk/arch/x86/include/asm/efi.h @@ -37,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ +#define MAX_EFI_IO_PAGES 100 + extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/trunk/arch/x86/include/asm/fixmap.h b/trunk/arch/x86/include/asm/fixmap.h index 63a79c77d220..dca8f03da5b2 100644 --- a/trunk/arch/x86/include/asm/fixmap.h +++ b/trunk/arch/x86/include/asm/fixmap.h @@ -24,6 +24,9 @@ #include #else #include +#ifdef CONFIG_EFI +#include +#endif #endif /* @@ -89,6 +92,13 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif +#ifdef CONFIG_X86_64 +#ifdef CONFIG_EFI + FIX_EFI_IO_MAP_LAST_PAGE, + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE + + MAX_EFI_IO_PAGES - 1, +#endif +#endif #ifdef CONFIG_X86_VISWS_APIC FIX_CO_CPU, /* Cobalt 
timer */ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ diff --git a/trunk/arch/x86/include/asm/i387.h b/trunk/arch/x86/include/asm/i387.h index 71c9e5183982..48f0004db8c9 100644 --- a/trunk/arch/x86/include/asm/i387.h +++ b/trunk/arch/x86/include/asm/i387.h @@ -172,13 +172,7 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -#ifdef CONFIG_MATH_EMULATION -extern void finit_task(struct task_struct *tsk); -#else -static inline void finit_task(struct task_struct *tsk) -{ -} -#endif +extern void finit(void); static inline void tolerant_fwait(void) { diff --git a/trunk/arch/x86/include/asm/io.h b/trunk/arch/x86/include/asm/io.h index e5383e3d2f8c..683d0b4c00fc 100644 --- a/trunk/arch/x86/include/asm/io.h +++ b/trunk/arch/x86/include/asm/io.h @@ -172,6 +172,8 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + #ifdef CONFIG_X86_32 # include "io_32.h" @@ -196,6 +198,7 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff diff --git a/trunk/arch/x86/include/asm/mce.h b/trunk/arch/x86/include/asm/mce.h index 563933e06a35..32c6e17b960b 100644 --- a/trunk/arch/x86/include/asm/mce.h +++ b/trunk/arch/x86/include/asm/mce.h @@ -11,8 +11,6 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ @@ -92,29 +90,14 @@ extern int mce_disabled; #include -void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. 
- */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); -void cmci_clear(void); -void cmci_reenable(void); -void cmci_rediscover(int dying); -void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } -static inline void cmci_clear(void) {} -static inline void cmci_reenable(void) {} -static inline void cmci_rediscover(int dying) {} -static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -123,23 +106,11 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -extern int mce_available(struct cpuinfo_x86 *c); - -void mce_log_therm_throt_event(__u64 status); +void mce_log_therm_throt_event(unsigned int cpu, __u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); - -typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); -DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); - -enum mcp_flags { - MCP_TIMESTAMP = (1 << 0), /* log time stamp */ - MCP_UC = (1 << 1), /* log uncorrected errors */ -}; -extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); - extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ @@ -149,8 +120,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif - -extern void (*mce_threshold_vector)(void); +extern void stop_mce(void); +extern void restart_mce(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/trunk/arch/x86/include/asm/mmzone_32.h b/trunk/arch/x86/include/asm/mmzone_32.h index ede6998bd92c..105fb90a0635 100644 --- a/trunk/arch/x86/include/asm/mmzone_32.h +++ b/trunk/arch/x86/include/asm/mmzone_32.h @@ -91,9 +91,46 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES -/* always use node 0 for bootmem on this numa platform */ -#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ - (NODE_DATA(0)->bdata) + +/* + * Following are macros that are specific to this numa platform. 
+ */ +#define reserve_bootmem(addr, size, flags) \ + reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) +#define alloc_bootmem(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +#define alloc_bootmem_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_nopanic(x) \ + __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +#define alloc_bootmem_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ + __pa(MAX_DMA_ADDRESS)); \ +}) +#define alloc_bootmem_low_pages_node(pgdat, x) \ +({ \ + struct pglist_data __maybe_unused \ + *__alloc_bootmem_node__pgdat = (pgdat); \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ +}) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/trunk/arch/x86/include/asm/msr-index.h b/trunk/arch/x86/include/asm/msr-index.h index 2dbd2314139e..358acc59ae04 100644 --- a/trunk/arch/x86/include/asm/msr-index.h +++ b/trunk/arch/x86/include/asm/msr-index.h @@ -77,11 +77,6 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 -/* These are consecutive and not in the normal 4er MCE bank block */ -#define MSR_IA32_MC0_CTL2 0x00000280 -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/trunk/arch/x86/include/asm/percpu.h b/trunk/arch/x86/include/asm/percpu.h index 8f1d2fbec1d4..aee103b26d01 100644 --- a/trunk/arch/x86/include/asm/percpu.h +++ b/trunk/arch/x86/include/asm/percpu.h @@ -43,14 +43,6 @@ #else /* ...!ASSEMBLY */ #include -#include - -#define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) -#define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index d0812e155f1d..1c097a3a6669 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -288,8 +288,6 @@ static inline int is_new_memtype_allowed(unsigned long flags, return 1; } -pmd_t *populate_extra_pmd(unsigned long vaddr); -pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 diff --git a/trunk/arch/x86/include/asm/xen/page.h b/trunk/arch/x86/include/asm/xen/page.h index 1a918dde46b5..4bd990ee43df 100644 --- a/trunk/arch/x86/include/asm/xen/page.h +++ b/trunk/arch/x86/include/asm/xen/page.h @@ -164,7 +164,6 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); -unsigned long 
arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/trunk/arch/x86/kernel/alternative.c b/trunk/arch/x86/kernel/alternative.c index 4c80f1557433..6907b8e85d52 100644 --- a/trunk/arch/x86/kernel/alternative.c +++ b/trunk/arch/x86/kernel/alternative.c @@ -414,17 +414,9 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); - - /* - * Don't stop machine check exceptions while patching. - * MCEs only happen when something got corrupted and in this - * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. - * Also machine checks tend to be broadcast and if one CPU - * goes into machine check the others follow quickly, so we don't - * expect a machine check to cause undue problems during to code - * patching. - */ +#ifdef CONFIG_X86_MCE + stop_mce(); +#endif apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -464,6 +456,9 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); +#ifdef CONFIG_X86_MCE + restart_mce(); +#endif } /** diff --git a/trunk/arch/x86/kernel/apic/apic.c b/trunk/arch/x86/kernel/apic/apic.c index 30909a258d0f..f9cecdfd05c5 100644 --- a/trunk/arch/x86/kernel/apic/apic.c +++ b/trunk/arch/x86/kernel/apic/apic.c @@ -46,7 +46,6 @@ #include #include #include -#include unsigned int num_processors; @@ -843,14 +842,6 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 6) { - v = apic_read(APIC_LVTCMCI); - if (!(v & APIC_LVT_MASKED)) - apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); - } -#endif - /* * Clean APIC state for other OSs: */ @@ -1250,12 +1241,6 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); - -#ifdef CONFIG_X86_MCE_INTEL - /* Recheck CMCI information after local APIC is up on CPU #0 */ - if (smp_processor_id() == 0) - cmci_recheck(); -#endif } void __cpuinit end_local_APIC_setup(void) diff --git a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 22590cf688ae..4b1c319d30c3 100644 --- a/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/trunk/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); + data->acpi_data = percpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/trunk/arch/x86/kernel/cpu/mcheck/Makefile b/trunk/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..d7d2323bbb69 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/Makefile +++ b/trunk/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,4 +4,3 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o -obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c index 3552119b091d..dfaebce3633e 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = 
read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a635..fe79985ce0f2 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,8 +3,6 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen */ #include @@ -26,9 +24,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -37,6 +32,7 @@ #include #define MISC_MCELOG_MINOR 227 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -51,7 +47,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static u64 *bank; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -62,19 +58,6 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); - rdtscll(m->tsc); -} - /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -136,11 +119,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + printk(KERN_EMERG "TSC %Lx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk("ADDR %Lx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); + printk("MISC %Lx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -166,10 +149,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -int mce_available(struct cpuinfo_x86 *c) +static int mce_available(struct cpuinfo_x86 *c) { - if (mce_dont_init) - return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -191,77 +172,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - struct mce m; - int i; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. - * - * TBD do the same check for MCI_STATUS_EN here? 
- */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) - continue; - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - - mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - - /* - * Clear state for this bank. - */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! + * The actual machine check handler */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -279,18 +190,17 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if (notify_die(DIE_NMI, "machine check", regs, error_code, + if ((regs + && notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - goto out2; - if (!banks) + || !banks) goto out2; - mce_setup(&m); - + memset(&m, 0, sizeof(struct mce)); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -300,32 +210,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - __clear_bit(i, toclear); - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; m.addr = 0; m.bank = i; + m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) continue; - /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. - */ - if ((m.status & MCI_STATUS_UC) == 0) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK); - - __set_bit(i, toclear); - if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -339,12 +235,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } - } else { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; } if (m.status & MCI_STATUS_MISCV) @@ -353,7 +243,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - mce_log(&m); + if (error_code >= 0) + rdtscll(m.tsc); + if (error_code != -2) + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -362,8 +255,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } + + add_taint(TAINT_MACHINE_CHECK); } + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). 
*/ if (!panicm_found) @@ -410,11 +309,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); + out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -434,13 +332,15 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(__u64 status) +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) { struct mce m; - mce_setup(&m); + memset(&m, 0, sizeof(m)); + m.cpu = cpu; m.bank = MCE_THERMAL_BANK; m.status = status; + rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ @@ -453,18 +353,18 @@ void mce_log_therm_throt_event(__u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(unsigned long); -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); -static void mcheck_timer(unsigned long data) +static void mcheck_check_cpu(void *info) { - struct timer_list *t = &per_cpu(mce_timer, data); - - WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + do_machine_check(NULL, 0); +} + +static void mcheck_timer(struct work_struct *work) +{ + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -477,41 +377,31 @@ static void mcheck_timer(unsigned long data) (int)round_jiffies_relative(check_interval*HZ)); } - t->expires = jiffies + next_interval; - add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + schedule_delayed_work(&mcheck_work, next_interval); } -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. */ int mce_notify_user(void) { - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - wake_up_interruptible(&mce_wait); + static unsigned long last_print; + unsigned long now = jiffies; - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. 
- */ - if (trigger[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); - if (__ratelimit(&ratelimit)) + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; printk(KERN_INFO "Machine check events logged\n"); + } return 1; } @@ -535,78 +425,63 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - idle_notifier_register(&mce_idle_notifier); - return 0; + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static void mce_init(void *dummy) { u64 cap; - unsigned b; + int i; rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; - if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; + banks = cap & 0xff; + if (banks > MCE_EXTENDED_BANK) { + banks = MCE_EXTENDED_BANK; + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); } - - /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); - } - /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; - return 0; -} - -static void mce_init(void *dummy) -{ - u64 cap; - int i; - mce_banks_t all_banks; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC, &all_banks); + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + if (i < NR_SYSFS_BANKS) + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + else + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) + if(c->x86 == 15) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, &bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. 
*/ @@ -629,38 +504,20 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) -{ - struct timer_list *t = &__get_cpu_var(mce_timer); - - /* data race harmless because everyone sets to the same value */ - if (!next_interval) - next_interval = check_interval * HZ; - if (!next_interval) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); - add_timer(t); -} - /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - if (!mce_available(c)) - return; + mce_cpu_quirks(c); - if (mce_cap_init() < 0) { - mce_dont_init = 1; + if (mce_dont_init || + !mce_available(c)) return; - } - mce_cpu_quirks(c); mce_init(NULL); mce_cpu_features(c); - mce_init_timer(); } /* @@ -716,7 +573,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned prev, next; + unsigned next; char __user *buf = ubuf; int i, err; @@ -735,32 +592,25 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); - goto timeout; - } - cpu_relax(); + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + goto timeout; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); -timeout: - ; + cpu_relax(); } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + timeout: + ; + } - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; synchronize_sched(); @@ -830,6 +680,20 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + /* * Old style boot options parsing. Only for compatibility. */ @@ -839,7 +703,8 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. +/* mce=off disables machine check. Note you can re-enable it later + using sysfs. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ @@ -863,29 +728,6 @@ __setup("mce=", mcheck_enable); * Sysfs support */ -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ - int i; - - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); - return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ - return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ - return mce_disable(); -} - /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -896,24 +738,20 @@ static int mce_resume(struct sys_device *dev) return 0; } -static void mce_cpu_restart(void *data) -{ - del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(NULL); - mce_init_timer(); -} - /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + if (next_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - on_each_cpu(mce_cpu_restart, NULL, 1); + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; @@ -940,26 +778,16 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *end; - u64 new = simple_strtoull(buf, &end, 0); - if (end == buf) - return -EINVAL; - bank[attr - bank_attrs] = new; - mce_restart(); - return end-buf; -} +/* + * TBD should generate these dynamically based on number of available banks. + * Have only 6 contol banks in /sysfs until then. + */ +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(bank5ctl,bank[5],mce_restart()) static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -986,6 +814,8 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -1015,22 +845,11 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } - for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - if (err) - goto error2; - } cpu_set(cpu, mce_device_initialized); return 0; -error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } error: - while (--i >= 0) { + while (i--) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -1049,46 +868,15 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } -/* Make sure there are no machine checks on offlined CPUs. 
*/ -static void mce_disable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -1103,21 +891,6 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); - add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - break; - case CPU_POST_DEAD: - /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; } return NOTIFY_OK; } @@ -1126,34 +899,6 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) -{ - int i; - - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - return -ENOMEM; -} - static __init int mce_init_device(void) { int err; @@ -1161,11 +906,6 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - - err = mce_init_banks(); - if (err) - return err; - err = sysdev_class_register(&mce_sysclass); if (err) return err; diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index c5a32f92d07e..9817506dd469 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,8 +79,6 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ -static void amd_threshold_interrupt(void); - /* * CPU Initialization */ @@ -176,8 +174,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); - - mce_threshold_vector = amd_threshold_interrupt; } } } @@ -191,13 +187,19 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. 
*/ -static void amd_threshold_interrupt(void) +asmlinkage void mce_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - mce_setup(&m); + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + memset(&m, 0, sizeof(m)); + rdtscll(m.tsc); + m.cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { @@ -231,8 +233,7 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + do_machine_check(NULL, 0); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); @@ -242,10 +243,13 @@ static void amd_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - return; + goto out; } } } +out: + inc_irq_stat(irq_threshold_count); + irq_exit(); } /* diff --git a/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aaa7d9730938..aa5e287c98e0 100644 --- a/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/trunk/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,8 +1,6 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo - * Copyright (C) 2008, 2009 Intel Corporation - * Author: Andi Kleen */ #include @@ -15,7 +13,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { @@ -28,7 +25,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(smp_processor_id(), msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); @@ -88,209 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } -/* - * Support for Intel Correct Machine Check Interrupts. This allows - * the CPU to raise an interrupt when a corrected machine check happened. - * Normally we pick those up using a regular polling timer. - * Also supports reliable discovery of shared banks. - */ - -static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); - -/* - * cmci_discover_lock protects against parallel discovery attempts - * which could race against each other. - */ -static DEFINE_SPINLOCK(cmci_discover_lock); - -#define CMCI_THRESHOLD 1 - -static int cmci_supported(int *banks) -{ - u64 cap; - - /* - * Vendor check is not strictly needed, but the initial - * initialization is vendor keyed and this - * makes sure none of the backdoors are entered otherwise. - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); - return !!(cap & MCG_CMCI_P); -} - -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); -} - -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - -/* - * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks - * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. 
- */ -static void cmci_discover(int banks, int boot) -{ - unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); - int hdr = 0; - int i; - - spin_lock(&cmci_discover_lock); - for (i = 0; i < banks; i++) { - u64 val; - - if (test_bit(i, owned)) - continue; - - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Already owned by someone else? */ - if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) - print_update("SHD", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - continue; - } - - val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) - print_update("CMCI", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - } else { - WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); - } - } - spin_unlock(&cmci_discover_lock); - if (hdr) - printk(KERN_CONT "\n"); -} - -/* - * Just in case we missed an event during initialization check - * all the CMCI owned banks. - */ -void cmci_recheck(void) -{ - unsigned long flags; - int banks; - - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) - return; - local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - local_irq_restore(flags); -} - -/* - * Disable CMCI on this CPU for all banks it owns when it goes down. - * This allows other CPUs to claim the banks on rediscovery. - */ -void cmci_clear(void) -{ - int i; - int banks; - u64 val; - - if (!cmci_supported(&banks)) - return; - spin_lock(&cmci_discover_lock); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } - spin_unlock(&cmci_discover_lock); -} - -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) -{ - int banks; - int cpu; - cpumask_var_t old; - - if (!cmci_supported(&banks)) - return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - - for_each_online_cpu (cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); -} - -/* - * Reenable CMCI on this CPU in case a CPU down failed. - */ -void cmci_reenable(void) -{ - int banks; - if (cmci_supported(&banks)) - cmci_discover(banks, 0); -} - -static __cpuinit void intel_init_cmci(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); - /* - * For CPU #0 this runs with still disabled APIC, but that's - * ok because only the vector is set up. We still do another - * check for the banks later for CPU #0 just to make sure - * to not miss any events. 
- */ - apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); - cmci_recheck(); -} - void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); - intel_init_cmci(); } diff --git a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c b/trunk/arch/x86/kernel/cpu/mcheck/threshold.c deleted file mode 100644 index 23ee9e730f78..000000000000 --- a/trunk/arch/x86/kernel/cpu/mcheck/threshold.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Common corrected MCE threshold handler code: - */ -#include -#include - -#include -#include -#include -#include - -static void default_threshold_interrupt(void) -{ - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); -} - -void (*mce_threshold_vector)(void) = default_threshold_interrupt; - -asmlinkage void mce_threshold_interrupt(void) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); -} diff --git a/trunk/arch/x86/kernel/efi.c b/trunk/arch/x86/kernel/efi.c index 1736acc4d7aa..b205272ad394 100644 --- a/trunk/arch/x86/kernel/efi.c +++ b/trunk/arch/x86/kernel/efi.c @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, addr, npages; void *p, *va; efi.systab = NULL; @@ -481,10 +481,7 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + if (PFN_UP(end) <= max_low_pfn_mapped) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/trunk/arch/x86/kernel/efi_64.c b/trunk/arch/x86/kernel/efi_64.c index 22c3b7828c50..a4ee29127fdf 100644 --- a/trunk/arch/x86/kernel/efi_64.c +++ b/trunk/arch/x86/kernel/efi_64.c @@ -100,11 +100,24 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - unsigned long last_map_pfn; + static unsigned pages_mapped __initdata; + unsigned i, pages; + unsigned long offset; - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) + pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + + if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; - return (void __iomem *)__va(phys_addr); + for (i = 0; i < pages; i++) { + __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, + phys_addr, PAGE_KERNEL); + phys_addr += PAGE_SIZE; + pages_mapped++; + } + + return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ + (pages_mapped - pages)) + offset; } diff --git a/trunk/arch/x86/kernel/i387.c b/trunk/arch/x86/kernel/i387.c index f2f8540a7f3d..b0f61f0dcd0a 100644 --- a/trunk/arch/x86/kernel/i387.c +++ b/trunk/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit_task(tsk); + finit(); set_stopped_child_used_math(tsk); return 0; } diff --git a/trunk/arch/x86/kernel/irq_32.c b/trunk/arch/x86/kernel/irq_32.c index 3b09634a5153..9dc6b2b24275 100644 --- a/trunk/arch/x86/kernel/irq_32.c +++ b/trunk/arch/x86/kernel/irq_32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include @@ -56,13 +55,13 @@ static 
inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(PAGE_SIZE))); +}; -static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); -static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { @@ -82,7 +81,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = __get_cpu_var(hardirq_ctx); + irqctx = hardirq_ctx[smp_processor_id()]; /* * this is where we switch to the IRQ stack. However, if we are @@ -126,34 +125,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (per_cpu(hardirq_ctx, cpu)) + if (hardirq_ctx[cpu]) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - per_cpu(hardirq_ctx, cpu) = irqctx; + hardirq_ctx[cpu] = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - per_cpu(softirq_ctx, cpu) = irqctx; + softirq_ctx[cpu] = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); + cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); } void irq_ctx_exit(int cpu) { - per_cpu(hardirq_ctx, cpu) = NULL; + hardirq_ctx[cpu] = NULL; } asmlinkage void do_softirq(void) @@ -170,7 +169,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = __get_cpu_var(softirq_ctx); + irqctx = softirq_ctx[smp_processor_id()]; irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/trunk/arch/x86/kernel/reboot.c b/trunk/arch/x86/kernel/reboot.c index 2aef36d8aca2..1cc18d439bbb 100644 --- a/trunk/arch/x86/kernel/reboot.c +++ b/trunk/arch/x86/kernel/reboot.c @@ -216,14 +216,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, - { /* Handle problems with rebooting on Dell XPS710 */ - .callback = set_bios_reboot, - .ident = "Dell XPS710", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), - }, - }, { } }; diff --git a/trunk/arch/x86/kernel/setup.c b/trunk/arch/x86/kernel/setup.c index b746deb9ebc6..4c54bc0d8ff3 100644 --- a/trunk/arch/x86/kernel/setup.c +++ b/trunk/arch/x86/kernel/setup.c @@ -770,9 +770,6 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); - if (efi_enabled) - efi_init(); - dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -792,6 +789,8 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + if (efi_enabled) + efi_init(); #ifdef 
CONFIG_X86_32 if (ppro_with_ram_bug()) { diff --git a/trunk/arch/x86/kernel/setup_percpu.c b/trunk/arch/x86/kernel/setup_percpu.c index c29f301d3885..d992e6cff730 100644 --- a/trunk/arch/x86/kernel/setup_percpu.c +++ b/trunk/arch/x86/kernel/setup_percpu.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -42,321 +41,6 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); -/** - * pcpu_need_numa - determine percpu allocation needs to consider NUMA - * - * If NUMA is not configured or there is only one NUMA node available, - * there is no reason to consider NUMA. This function determines - * whether percpu allocation should consider NUMA or not. - * - * RETURNS: - * true if NUMA should be considered; otherwise, false. - */ -static bool __init pcpu_need_numa(void) -{ -#ifdef CONFIG_NEED_MULTIPLE_NODES - pg_data_t *last = NULL; - unsigned int cpu; - - for_each_possible_cpu(cpu) { - int node = early_cpu_to_node(cpu); - - if (node_online(node) && NODE_DATA(node) && - last && last != NODE_DATA(node)) - return true; - - last = NODE_DATA(node); - } -#endif - return false; -} - -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = __alloc_bootmem_nopanic(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), - size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return __alloc_bootmem_nopanic(size, align, goal); -#endif -} - -/* - * Remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); -} - -static ssize_t __init setup_pcpu_remap(size_t static_size) -{ - static struct vm_struct vm; - pg_data_t *last; - size_t ptrs_size; - unsigned int cpu; - ssize_t ret; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. 
- */ - if (!cpu_has_pse || pcpu_need_numa()) - return -EINVAL; - - last = NULL; - for_each_possible_cpu(cpu) { - int node = early_cpu_to_node(cpu); - - if (node_online(node) && NODE_DATA(node) && - last && last != NODE_DATA(node)) - goto proceed; - - last = NODE_DATA(node); - } - return -EINVAL; - -proceed: - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - - /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); - - for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) - goto enomem; - - /* - * Only use pcpur_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); - - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); - } - - /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd; - - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); - goto out_free_ar; - -enomem: - for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; -} -#else -static ssize_t __init setup_pcpu_remap(size_t static_size) -{ - return -EINVAL; -} -#endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using - * bootmem allocator and used as-is without being mapped into vmalloc - * area. This enables the first chunk to piggy back on the linear - * physical PMD mapping and doesn't add any additional pressure to - * TLB. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size - + ((size_t)pageno << PAGE_SHIFT)); -} - -static ssize_t __init setup_pcpu_embed(size_t static_size) -{ - unsigned int cpu; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. 
- */ - if (!cpu_has_pse || pcpu_need_numa()) - return -EINVAL; - - /* allocate and copy */ - pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) - memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, - static_size); - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - pcpue_unit_size, - pcpue_unit_size - static_size, pcpue_ptr, - NULL); -} - -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) -{ - populate_extra_pte(addr); -} - -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) - goto enomem; - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, - pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -377,35 +61,38 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; - unsigned int cpu; - unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + ssize_t size; + char *ptr; + int cpu; + + /* Copy section for each CPU (we discard the original) */ + size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. 
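Aside (illustrative sketch, not part of the patch): the hunk above drops the code that tried the remap, embed and 4k first-chunk allocators in turn, which is a plain first-success-wins chain. The stub functions and sizes below are made up.

#include <stdio.h>
#include <errno.h>
#include <stddef.h>

typedef long (*pcpu_setup_fn)(size_t static_size);

static long setup_remap_stub(size_t size) { return -EINVAL; }   /* e.g. no PSE */
static long setup_embed_stub(size_t size) { return -EINVAL; }   /* e.g. NUMA box */
static long setup_4k_stub(size_t size)    { return (long)size; } /* always works */

int main(void)
{
	pcpu_setup_fn chain[] = { setup_remap_stub, setup_embed_stub, setup_4k_stub };
	size_t static_size = 16384;
	long ret = -EINVAL;
	size_t i;

	/* try each allocator until one succeeds */
	for (i = 0; i < sizeof(chain) / sizeof(chain[0]) && ret < 0; i++)
		ret = chain[i](static_size);

	if (ret < 0)
		printf("cannot allocate static percpu area\n");
	else
		printf("first chunk set up, unit size %ld bytes\n", ret);
	return 0;
}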
- */ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); - /* alrighty, percpu areas up and running */ - delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(cpu); + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(size); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + pr_debug("per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); + } +#endif + + memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); + per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); @@ -438,6 +125,8 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); + + DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ diff --git a/trunk/arch/x86/math-emu/fpu_aux.c b/trunk/arch/x86/math-emu/fpu_aux.c index aa0987088774..491e737ce547 100644 --- a/trunk/arch/x86/math-emu/fpu_aux.c +++ b/trunk/arch/x86/math-emu/fpu_aux.c @@ -30,29 +30,20 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_task(struct task_struct *tsk) +void finit(void) { - struct i387_soft_struct *soft = &tsk->thread.xstate->soft; - struct address *oaddr, *iaddr; - soft->cwd = 0x037f; - soft->swd = 0; - soft->ftop = 0; /* We don't keep top in the status word internally. */ - soft->twd = 0xffff; + control_word = 0x037f; + partial_status = 0; + top = 0; /* We don't keep top in the status word internally. */ + fpu_tag_word = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - oaddr = (struct address *)&soft->foo; - oaddr->offset = 0; - oaddr->selector = 0; - iaddr = (struct address *)&soft->fip; - iaddr->offset = 0; - iaddr->selector = 0; - iaddr->opcode = 0; - soft->no_update = 1; -} - -void finit(void) -{ - finit_task(current); + operand_address.offset = 0; + operand_address.selector = 0; + instruction_address.offset = 0; + instruction_address.selector = 0; + instruction_address.opcode = 0; + no_ip_update = 1; } /* diff --git a/trunk/arch/x86/mm/init.c b/trunk/arch/x86/mm/init.c index ce6a722587d8..f89df52683c5 100644 --- a/trunk/arch/x86/mm/init.c +++ b/trunk/arch/x86/mm/init.c @@ -1,9 +1,33 @@ +#include #include + #include #include +#include #include #include +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. 
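A standalone illustration (not kernel code) of the /dev/mem policy described in the comment above and implemented by devmem_is_allowed() just below; page_is_ram() and iomem_is_exclusive() are stubbed here with an assumed memory layout.

#include <stdio.h>

#define PAGE_SHIFT 12

/* Assume RAM covers pages 0..0x7ffff and a driver claims page 0x90000 exclusively. */
static int page_is_ram(unsigned long pagenr)          { return pagenr < 0x80000; }
static int iomem_is_exclusive(unsigned long physaddr) { return (physaddr >> PAGE_SHIFT) == 0x90000; }

static int demo_devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)                            /* first MB: BIOS code/data, legacy apps */
		return 1;
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) /* resource claimed exclusively */
		return 0;
	if (!page_is_ram(pagenr))                     /* MMIO, ACPI tables, etc. */
		return 1;
	return 0;                                     /* ordinary kernel RAM: refuse */
}

int main(void)
{
	unsigned long pages[] = { 0x50, 0x40000, 0x90000, 0xa0000 };
	for (int i = 0; i < 4; i++)
		printf("page %#lx -> %s\n", pages[i],
		       demo_devmem_is_allowed(pages[i]) ? "allowed" : "denied");
	return 0;
}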
+ */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr = begin; diff --git a/trunk/arch/x86/mm/init_32.c b/trunk/arch/x86/mm/init_32.c index 47df0e1bbeb9..917c4e60c767 100644 --- a/trunk/arch/x86/mm/init_32.c +++ b/trunk/arch/x86/mm/init_32.c @@ -135,23 +135,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -pmd_t * __init populate_extra_pmd(unsigned long vaddr) -{ - int pgd_idx = pgd_index(vaddr); - int pmd_idx = pmd_index(vaddr); - - return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; -} - -pte_t * __init populate_extra_pte(unsigned long vaddr) -{ - int pte_idx = pte_index(vaddr); - pmd_t *pmd; - - pmd = populate_extra_pmd(vaddr); - return one_page_table_init(pmd) + pte_idx; -} - static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { @@ -371,27 +354,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, } } -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/trunk/arch/x86/mm/init_64.c b/trunk/arch/x86/mm/init_64.c index 07f44d491df1..074435e79824 100644 --- a/trunk/arch/x86/mm/init_64.c +++ b/trunk/arch/x86/mm/init_64.c @@ -168,51 +168,34 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +void +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { - if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - } - return pud_offset(pgd, vaddr); -} + pud_t *pud; + pmd_t *pmd; + pte_t *pte; -static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) -{ + pud = pud_page + pud_index(vaddr); if (pud_none(*pud)) { - pmd_t *pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) + if (pmd != pmd_offset(pud, 0)) { printk(KERN_ERR "PAGETABLE BUG #01! 
%p <-> %p\n", - pmd, pmd_offset(pud, 0)); + pmd, pmd_offset(pud, 0)); + return; + } } - return pmd_offset(pud, vaddr); -} - -static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) -{ + pmd = pmd_offset(pud, vaddr); if (pmd_none(*pmd)) { - pte_t *pte = (pte_t *) spp_getpage(); + pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) + if (pte != pte_offset_kernel(pmd, 0)) { printk(KERN_ERR "PAGETABLE BUG #02!\n"); + return; + } } - return pte_offset_kernel(pmd, vaddr); -} - -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -222,7 +205,8 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void +set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -239,24 +223,6 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -pmd_t * __init populate_extra_pmd(unsigned long vaddr) -{ - pgd_t *pgd; - pud_t *pud; - - pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); - return fill_pmd(pud, vaddr); -} - -pte_t * __init populate_extra_pte(unsigned long vaddr) -{ - pmd_t *pmd; - - pmd = populate_extra_pmd(vaddr); - return fill_pte(pmd, vaddr); -} - /* * Create large page table mappings for a range of physical addresses. */ @@ -910,28 +876,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; diff --git a/trunk/arch/x86/mm/kmmio.c b/trunk/arch/x86/mm/kmmio.c index 9f205030d9aa..93d82038af4b 100644 --- a/trunk/arch/x86/mm/kmmio.c +++ b/trunk/arch/x86/mm/kmmio.c @@ -32,14 +32,11 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ - bool armed; /* * Number of times this page has been registered as a part * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU) and post_kmmio_handler(). - * Protected by kmmio_lock, when linked into kmmio_page_table. + * Used only by writers (RCU). 
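Sketch only (not part of the patch): the count field documented above gates arming and disarming of a traced page, so the first registered probe arms it and the last release disarms it. Here arm/disarm are reduced to prints and all names are made up.

#include <stdio.h>
#include <stdbool.h>

struct fault_page {
	unsigned long page;
	int count;   /* probes currently registered on this page */
	bool armed;  /* present bit cleared in the page tables?  */
};

static void arm(struct fault_page *f)    { f->armed = true;  printf("arm    %#lx\n", f->page); }
static void disarm(struct fault_page *f) { f->armed = false; printf("disarm %#lx\n", f->page); }

static void add_probe(struct fault_page *f)
{
	if (!f->count)
		arm(f);
	f->count++;
}

static void release_probe(struct fault_page *f)
{
	f->count--;
	if (!f->count)
		disarm(f);
}

int main(void)
{
	struct fault_page f = { .page = 0xd0000000UL, .count = 0, .armed = false };

	add_probe(&f);     /* first probe arms the page   */
	add_probe(&f);     /* second probe: already armed */
	release_probe(&f); /* still one user left         */
	release_probe(&f); /* last user gone: disarm      */
	return 0;
}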
*/ int count; }; @@ -108,85 +105,57 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) -{ - pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(v)); -} - -static void set_pte_presence(pte_t *pte, bool present, bool *old) -{ - pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(v)); -} - -static int set_page_presence(unsigned long addr, bool present, bool *old) +static void set_page_present(unsigned long addr, bool present, + unsigned int *pglevel) { + pteval_t pteval; + pmdval_t pmdval; unsigned int level; + pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); if (!pte) { pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return -1; + return; } + if (pglevel) + *pglevel = level; + switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); break; + case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); break; + default: pr_err("kmmio: unexpected page level 0x%x.\n", level); - return -1; + return; } __flush_tlb_one(addr); - return 0; } -/* - * Mark the given page as not present. Access to it will trigger a fault. - * - * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the - * protection is ignored here. RCU read lock is assumed held, so the struct - * will not disappear unexpectedly. Furthermore, the caller must guarantee, - * that double arming the same virtual address (page) cannot occur. - * - * Double disarming on the other hand is allowed, and may occur when a fault - * and mmiotrace shutdown happen simultaneously. - */ -static int arm_kmmio_fault_page(struct kmmio_fault_page *f) +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - int ret; - WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); - if (f->armed) { - pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); - } - ret = set_page_presence(f->page, false, &f->old_presence); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); - f->armed = true; - return ret; + set_page_present(page & PAGE_MASK, false, pglevel); } -/** Restore the given page to saved presence state. */ -static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) +/** Mark the given page as present. */ +static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); - WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); - f->armed = false; + set_page_present(page & PAGE_MASK, true, pglevel); } /* @@ -233,32 +202,28 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); if (addr == ctx->addr) { /* - * A second fault on the same page means some other - * condition needs handling by do_page_fault(), the - * page really not being present is the most common. 
+ * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. */ - pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", - addr, smp_processor_id()); - - if (!faultpage->old_presence) - pr_info("kmmio: unexpected secondary hit for " - "address 0x%08lx on CPU %d.\n", addr, - smp_processor_id()); - } else { - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at - * least prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at least + * prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); - disarm_kmmio_fault_page(faultpage); - } + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -279,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage); + disarm_kmmio_fault_page(ctx->fpage->page, NULL); /* * If another cpu accesses the same page while we are stepping, @@ -310,7 +275,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_warning("kmmio: spurious debug trap on CPU %d.\n", + pr_debug("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -318,11 +283,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - /* Prevent racing against release_kmmio_fault_page(). 
*/ - spin_lock(&kmmio_lock); - if (ctx->fpage->count) - arm_kmmio_fault_page(ctx->fpage); - spin_unlock(&kmmio_lock); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -354,25 +315,21 @@ static int add_kmmio_fault_page(unsigned long page) f = get_kmmio_fault_page(page); if (f) { if (!f->count) - arm_kmmio_fault_page(f); + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } - f = kzalloc(sizeof(*f), GFP_ATOMIC); + f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -1; f->count = 1; f->page = page; - - if (arm_kmmio_fault_page(f)) { - kfree(f); - return -1; - } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + arm_kmmio_fault_page(f->page, NULL); + return 0; } @@ -390,7 +347,7 @@ static void release_kmmio_fault_page(unsigned long page, f->count--; BUG_ON(f->count < 0); if (!f->count) { - disarm_kmmio_fault_page(f); + disarm_kmmio_fault_page(f->page, NULL); f->release_next = *release_list; *release_list = f; } diff --git a/trunk/arch/x86/mm/testmmiotrace.c b/trunk/arch/x86/mm/testmmiotrace.c index 427fd1b56df5..ab50a8d7402c 100644 --- a/trunk/arch/x86/mm/testmmiotrace.c +++ b/trunk/arch/x86/mm/testmmiotrace.c @@ -1,5 +1,5 @@ /* - * Written by Pekka Paalanen, 2008-2009 + * Written by Pekka Paalanen, 2008 */ #include #include @@ -9,74 +9,35 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " - "(or 8 MB if read_far is non-zero)."); - -static unsigned long read_far = 0x400100; -module_param(read_far, ulong, 0); -MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " - "(default: 0x400100)."); - -static unsigned v16(unsigned i) -{ - return i * 12 + 7; -} - -static unsigned v32(unsigned i) -{ - return i * 212371 + 13; -} +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; - pr_info(MODULE_NAME ": write test.\n"); mmiotrace_printk("Write test.\n"); - for (i = 0; i < 256; i++) iowrite8(i, p + i); - for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(v16(i), p + i); - + iowrite16(i * 12 + 7, p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(v32(i), p + i); + iowrite32(i * 212371 + 13, p + i); } static void do_read_test(void __iomem *p) { unsigned int i; - unsigned errs[3] = { 0 }; - pr_info(MODULE_NAME ": read test.\n"); mmiotrace_printk("Read test.\n"); - for (i = 0; i < 256; i++) - if (ioread8(p + i) != i) - ++errs[0]; - + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - if (ioread16(p + i) != v16(i)) - ++errs[1]; - + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - if (ioread32(p + i) != v32(i)) - ++errs[2]; - - mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", - errs[0], errs[1], errs[2]); + ioread32(p + i); } -static void do_read_far_test(void __iomem *p) +static void do_test(void) { - pr_info(MODULE_NAME ": read far test.\n"); - mmiotrace_printk("Read far test.\n"); - - ioread32(p + read_far); -} - -static void do_test(unsigned long size) -{ - void __iomem *p = ioremap_nocache(mmio_address, size); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; @@ -84,15 +45,11 @@ static void do_test(unsigned long size) mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); - if (read_far && read_far < size - 4) - do_read_far_test(p); iounmap(p); } static int __init init(void) { 
- unsigned long size = (read_far) ? (8 << 20) : (16 << 10); - if (mmio_address == 0) { pr_err(MODULE_NAME ": you have to use the module argument " "mmio_address.\n"); @@ -101,11 +58,10 @@ static int __init init(void) return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " - "address space, and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); - do_test(size); - pr_info(MODULE_NAME ": All done.\n"); + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); return 0; } diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c index 82cd39a6cbd3..c52f4034c7fd 100644 --- a/trunk/arch/x86/xen/enlighten.c +++ b/trunk/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); + info.mfn = virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,10 +301,8 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = arbitrary_virt_to_mfn((void *)va); - + frames[f] = virt_to_mfn(va); make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -316,7 +314,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -490,7 +488,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); + xmaddr_t maddr = virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c index cb6afa4ec95c..319bd40a57c2 100644 --- a/trunk/arch/x86/xen/mmu.c +++ b/trunk/arch/x86/xen/mmu.c @@ -276,13 +276,6 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -unsigned long arbitrary_virt_to_mfn(void *vaddr) -{ - xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); - - return PFN_DOWN(maddr.maddr); -} - xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/trunk/arch/x86/xen/smp.c b/trunk/arch/x86/xen/smp.c index 8d470562ffc9..035582ae815d 100644 --- a/trunk/arch/x86/xen/smp.c +++ b/trunk/arch/x86/xen/smp.c @@ -219,7 +219,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; - unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -249,12 +248,9 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); - - gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_frames[0] = virt_to_mfn(gdt); ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; diff --git 
a/trunk/block/blktrace.c b/trunk/block/blktrace.c index 028120a0965a..7cf9d1ff45a0 100644 --- a/trunk/block/blktrace.c +++ b/trunk/block/blktrace.c @@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); if (!bt->msg_data) goto err; diff --git a/trunk/crypto/api.c b/trunk/crypto/api.c index 38a2bc02a98c..efe77df6863f 100644 --- a/trunk/crypto/api.c +++ b/trunk/crypto/api.c @@ -215,19 +215,8 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); type &= mask; - alg = crypto_alg_lookup(name, type, mask); - if (!alg) { - char tmp[CRYPTO_MAX_ALG_NAME]; - - request_module(name); - - if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) && - snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) - request_module(tmp); - - alg = crypto_alg_lookup(name, type, mask); - } - + alg = try_then_request_module(crypto_alg_lookup(name, type, mask), + name); if (alg) return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; diff --git a/trunk/drivers/acpi/processor_perflib.c b/trunk/drivers/acpi/processor_perflib.c index 68fd3d292799..9cc769b587ff 100644 --- a/trunk/drivers/acpi/processor_perflib.c +++ b/trunk/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !per_cpu_ptr(performance, i)) { + if (!performance || !percpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = per_cpu_ptr(performance, i); + pr->performance = percpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/trunk/drivers/crypto/ixp4xx_crypto.c b/trunk/drivers/crypto/ixp4xx_crypto.c index d9e751be8c5f..2d637e0fbc03 100644 --- a/trunk/drivers/crypto/ixp4xx_crypto.c +++ b/trunk/drivers/crypto/ixp4xx_crypto.c @@ -457,12 +457,10 @@ static int init_ixp_crypto(void) if (!ctx_pool) { goto err; } - ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0, - "ixp_crypto:out", NULL); + ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); if (ret) goto err; - ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0, - "ixp_crypto:in", NULL); + ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); if (ret) { qmgr_release_queue(SEND_QID); goto err; diff --git a/trunk/drivers/crypto/padlock-aes.c b/trunk/drivers/crypto/padlock-aes.c index 3f0fdd18255d..856b3cc25583 100644 --- a/trunk/drivers/crypto/padlock-aes.c +++ b/trunk/drivers/crypto/padlock-aes.c @@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("aes-all"); +MODULE_ALIAS("aes"); diff --git a/trunk/drivers/crypto/padlock-sha.c b/trunk/drivers/crypto/padlock-sha.c index a2c8e8514b63..a7fbadebf623 100644 --- a/trunk/drivers/crypto/padlock-sha.c +++ b/trunk/drivers/crypto/padlock-sha.c @@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support."); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("sha1-all"); -MODULE_ALIAS("sha256-all"); +MODULE_ALIAS("sha1"); +MODULE_ALIAS("sha256"); MODULE_ALIAS("sha1-padlock"); MODULE_ALIAS("sha256-padlock"); diff --git a/trunk/drivers/dma/iop-adma.c b/trunk/drivers/dma/iop-adma.c index 647374acba94..ea5440dd10dc 100644 --- a/trunk/drivers/dma/iop-adma.c +++ b/trunk/drivers/dma/iop-adma.c @@ 
-1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma"); static struct platform_driver iop_adma_driver = { .probe = iop_adma_probe, - .remove = __devexit_p(iop_adma_remove), + .remove = iop_adma_remove, .driver = { .owner = THIS_MODULE, .name = "iop-adma", diff --git a/trunk/drivers/dma/mv_xor.c b/trunk/drivers/dma/mv_xor.c index 5d5d5b31867f..d35cbd1ff0b3 100644 --- a/trunk/drivers/dma/mv_xor.c +++ b/trunk/drivers/dma/mv_xor.c @@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp, static struct platform_driver mv_xor_driver = { .probe = mv_xor_probe, - .remove = __devexit_p(mv_xor_remove), + .remove = mv_xor_remove, .driver = { .owner = THIS_MODULE, .name = MV_XOR_NAME, diff --git a/trunk/drivers/gpu/drm/drm_stub.c b/trunk/drivers/gpu/drm/drm_stub.c index 7c8b15b22bf2..096e2a37446d 100644 --- a/trunk/drivers/gpu/drm/drm_stub.c +++ b/trunk/drivers/gpu/drm/drm_stub.c @@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, file_priv->minor->master != file_priv->master) { mutex_lock(&dev->struct_mutex); file_priv->minor->master = drm_master_get(file_priv->master); - mutex_unlock(&dev->struct_mutex); + mutex_lock(&dev->struct_mutex); } return 0; diff --git a/trunk/drivers/i2c/busses/i2c-mv64xxx.c b/trunk/drivers/i2c/busses/i2c-mv64xxx.c index 7f186bbcb99d..eeda276f8f16 100644 --- a/trunk/drivers/i2c/busses/i2c-mv64xxx.c +++ b/trunk/drivers/i2c/busses/i2c-mv64xxx.c @@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd, return 0; } -static void +static void __devexit mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) { if (drv_data->reg_base) { @@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) static struct platform_driver mv64xxx_i2c_driver = { .probe = mv64xxx_i2c_probe, - .remove = __devexit_p(mv64xxx_i2c_remove), + .remove = mv64xxx_i2c_remove, .driver = { .owner = THIS_MODULE, .name = MV64XXX_I2C_CTLR_NAME, diff --git a/trunk/drivers/mtd/nand/orion_nand.c b/trunk/drivers/mtd/nand/orion_nand.c index c2dfd3ea353d..917cf8d3ae95 100644 --- a/trunk/drivers/mtd/nand/orion_nand.c +++ b/trunk/drivers/mtd/nand/orion_nand.c @@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev) static struct platform_driver orion_nand_driver = { .probe = orion_nand_probe, - .remove = __devexit_p(orion_nand_remove), + .remove = orion_nand_remove, .driver = { .name = "orion_nand", .owner = THIS_MODULE, diff --git a/trunk/drivers/net/arm/Makefile b/trunk/drivers/net/arm/Makefile index 811a3ccd14c1..c69c0cdba4a2 100644 --- a/trunk/drivers/net/arm/Makefile +++ b/trunk/drivers/net/arm/Makefile @@ -4,7 +4,7 @@ # obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o -obj-$(CONFIG_ARM_ETHERH) += etherh.o +obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o obj-$(CONFIG_ARM_ETHER3) += ether3.o obj-$(CONFIG_ARM_ETHER1) += ether1.o obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o diff --git a/trunk/drivers/net/arm/etherh.c b/trunk/drivers/net/arm/etherh.c index f52f668c49bf..54b52e5b1821 100644 --- a/trunk/drivers/net/arm/etherh.c +++ b/trunk/drivers/net/arm/etherh.c @@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = { .ndo_open = etherh_open, .ndo_stop = etherh_close, .ndo_set_config = etherh_set_config, - .ndo_start_xmit = __ei_start_xmit, - .ndo_tx_timeout = __ei_tx_timeout, - .ndo_get_stats = __ei_get_stats, - .ndo_set_multicast_list = __ei_set_multicast_list, + .ndo_start_xmit = ei_start_xmit, + .ndo_tx_timeout = ei_tx_timeout, + .ndo_get_stats = ei_get_stats, + .ndo_set_multicast_list = 
ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = __ei_poll, + .ndo_poll_controller = ei_poll, #endif }; diff --git a/trunk/drivers/video/pxafb.c b/trunk/drivers/video/pxafb.c index 2552b9f325ee..48ff701d3a72 100644 --- a/trunk/drivers/video/pxafb.c +++ b/trunk/drivers/video/pxafb.c @@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev) static struct platform_driver pxafb_driver = { .probe = pxafb_probe, - .remove = __devexit_p(pxafb_remove), + .remove = pxafb_remove, .suspend = pxafb_suspend, .resume = pxafb_resume, .driver = { diff --git a/trunk/include/linux/bootmem.h b/trunk/include/linux/bootmem.h index 455d83219fae..95837bfb5256 100644 --- a/trunk/include/linux/bootmem.h +++ b/trunk/include/linux/bootmem.h @@ -65,20 +65,23 @@ extern void free_bootmem(unsigned long addr, unsigned long size); #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) -extern int reserve_bootmem(unsigned long addr, - unsigned long size, - int flags); extern int reserve_bootmem_node(pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size, - int flags); + unsigned long physaddr, + unsigned long size, + int flags); +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); +#endif -extern void *__alloc_bootmem(unsigned long size, +extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_nopanic(unsigned long size, +extern void *__alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -87,35 +90,30 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); - +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_nopanic(x) \ __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) - -#define alloc_bootmem_low(x) \ - __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, 
unsigned long size, int flags); diff --git a/trunk/include/linux/percpu.h b/trunk/include/linux/percpu.h index 545b068bcb70..3577ffd90d45 100644 --- a/trunk/include/linux/percpu.h +++ b/trunk/include/linux/percpu.h @@ -76,98 +76,52 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - -/* minimum unit size, also is the maximum supported allocation size */ -#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) - -/* - * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy - * back on the first chunk if arch is manually allocating and mapping - * it for faster access (as a part of large page mapping for example). - * Note that dynamic percpu allocator covers both static and dynamic - * areas, so these values are bigger than PERCPU_MODULE_RESERVE. - * - * On typical configuration with modules, the following values leave - * about 8k of free space on the first chunk after boot on both x86_32 - * and 64 when module support is enabled. When module support is - * disabled, it's much tighter. - */ -#ifndef PERCPU_DYNAMIC_RESERVE -# if BITS_PER_LONG > 32 -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) -# else -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) -# endif -# else -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) -# else -# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) -# endif -# endif -#endif /* PERCPU_DYNAMIC_RESERVE */ - -extern void *pcpu_base_addr; - -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); - -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); - -/* - * Use this to get to a cpu's version of the per-cpu object - * dynamically allocated. Non-atomic access to the current CPU's - * version should probably be combined with get_cpu()/put_cpu(). - */ -#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) - -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - struct percpu_data { void *ptrs[1]; }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) - -#define per_cpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). + */ +#define percpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - -extern void *__alloc_percpu(size_t size, size_t align); -extern void free_percpu(void *__pdata); +extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); +extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size, size_t align) +static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. 
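As an aside (user-space sketch, not the kernel implementation): the percpu_ptr()/__percpu_disguise() pair restored above hands out the bitwise complement of the real struct percpu_data pointer, so a stray direct dereference of the handle faults, while percpu_ptr() undoes the complement and indexes ptrs[cpu]. The demo_* names below are made up.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NR_CPUS 4

struct percpu_data {
	void *ptrs[NR_CPUS];
};

/* Complementing twice gives back the original pointer. */
#define percpu_disguise(p) ((struct percpu_data *)~(uintptr_t)(p))

static void *demo_alloc_percpu(size_t size)
{
	struct percpu_data *pdata = calloc(1, sizeof(*pdata));
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pdata->ptrs[cpu] = calloc(1, size); /* one zeroed buffer per CPU */
	return percpu_disguise(pdata);              /* hand out the disguised handle */
}

#define demo_percpu_ptr(ptr, cpu) \
	((__typeof__(ptr))(percpu_disguise(ptr)->ptrs[(cpu)]))

int main(void)
{
	long *counter = demo_alloc_percpu(sizeof(long));

	*demo_percpu_ptr(counter, 2) = 42; /* touch CPU 2's copy only */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %ld\n", cpu, *demo_percpu_ptr(counter, cpu));
	return 0;
}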
- */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp); } -static inline void free_percpu(void *p) +static inline void percpu_free(void *__pdata) { - kfree(p); + kfree(__pdata); } #endif /* CONFIG_SMP */ -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ - __alignof__(type)) +#define percpu_alloc_mask(size, gfp, mask) \ + __percpu_alloc_mask((size), (gfp), &(mask)) + +#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) + +/* (legacy) interface for use without CPU hotplug handling */ + +#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ + cpu_possible_map) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +#define free_percpu(ptr) percpu_free((ptr)) +#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) #endif /* __LINUX_PERCPU_H */ diff --git a/trunk/include/linux/rcuclassic.h b/trunk/include/linux/rcuclassic.h index 80044a4f3ab9..f3f697df1d71 100644 --- a/trunk/include/linux/rcuclassic.h +++ b/trunk/include/linux/rcuclassic.h @@ -181,10 +181,4 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) -/* A context switch is a grace period for rcuclassic. */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/trunk/include/linux/rcupdate.h b/trunk/include/linux/rcupdate.h index 528343e6da51..921340a7b71c 100644 --- a/trunk/include/linux/rcupdate.h +++ b/trunk/include/linux/rcupdate.h @@ -52,9 +52,6 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -/* Internal to kernel, but needed by rcupreempt.h. */ -extern int rcu_scheduler_active; - #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -268,7 +265,6 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); -extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/trunk/include/linux/rcupreempt.h b/trunk/include/linux/rcupreempt.h index 74304b4538d8..3e05c09b54a2 100644 --- a/trunk/include/linux/rcupreempt.h +++ b/trunk/include/linux/rcupreempt.h @@ -142,19 +142,4 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ -/* - * A context switch is a grace period for rcupreempt synchronize_rcu() - * only during early boot, before the scheduler has been initialized. - * So, how the heck do we get a context switch? Well, if the caller - * invokes synchronize_rcu(), they are willing to accept a context - * switch, so we simply pretend that one happened. - * - * After boot, there might be a blocked or preempted task in an RCU - * read-side critical section, so we cannot then take the fastpath. - */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1 && !rcu_scheduler_active; -} - #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/trunk/include/linux/rcutree.h b/trunk/include/linux/rcutree.h index a722fb67bb2d..d4368b7975c3 100644 --- a/trunk/include/linux/rcutree.h +++ b/trunk/include/linux/rcutree.h @@ -326,10 +326,4 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ -/* A context switch is a grace period for rcutree. 
*/ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - #endif /* __LINUX_RCUTREE_H */ diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index a7c7698583bb..f0a50b20e8a0 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -2303,13 +2303,9 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); -extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif -extern int task_can_switch_user(struct user_struct *up, - struct task_struct *tsk); - #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/trunk/include/linux/vmalloc.h b/trunk/include/linux/vmalloc.h index a43ebec3a7b9..9c0890c7a06a 100644 --- a/trunk/include/linux/vmalloc.h +++ b/trunk/include/linux/vmalloc.h @@ -95,9 +95,6 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); -extern int map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages); -extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ @@ -113,6 +110,5 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/trunk/init/main.c b/trunk/init/main.c index 6bf83afd654d..6441083f8273 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state __read_mostly; +enum system_states system_state; EXPORT_SYMBOL(system_state); /* @@ -464,7 +464,6 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/trunk/kernel/module.c b/trunk/kernel/module.c index 1f0657ae555b..ba22484a987e 100644 --- a/trunk/kernel/module.c +++ b/trunk/kernel/module.c @@ -51,7 +51,6 @@ #include #include #include -#include #if 0 #define DEBUGP printk @@ -367,34 +366,6 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP - -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - void *ptr; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __alloc_percpu(size, align); - if (!ptr) - printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); - return ptr; -} - -static void percpu_modfree(void *freeme) -{ - free_percpu(freeme); -} - -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -509,6 +480,21 @@ static void percpu_modfree(void *freeme) } } +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + static int percpu_modinit(void) { pcpu_num_used = 2; @@ -527,26 +513,7 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); - -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ - -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - #else /* ... !CONFIG_SMP */ - static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -568,7 +535,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } - #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/trunk/kernel/rcuclassic.c b/trunk/kernel/rcuclassic.c index 654c640a6b9c..bd5a9003497c 100644 --- a/trunk/kernel/rcuclassic.c +++ b/trunk/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/rcupdate.c b/trunk/kernel/rcupdate.c index cae8a059cf47..d92a76a881aa 100644 --- a/trunk/kernel/rcupdate.c +++ b/trunk/kernel/rcupdate.c @@ -44,7 +44,6 @@ #include #include #include -#include enum rcu_barrier { RCU_BARRIER_STD, @@ -56,7 +55,6 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -82,10 +80,6 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -181,9 +175,3 @@ void __init rcu_init(void) __rcu_init(); } -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} diff --git a/trunk/kernel/rcupreempt.c b/trunk/kernel/rcupreempt.c index 5d59e850fb71..33cfc50781f9 100644 --- a/trunk/kernel/rcupreempt.c +++ b/trunk/kernel/rcupreempt.c @@ -1181,9 +1181,6 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; - if (num_online_cpus() == 1) - return; /* blocking is gp if only one CPU! */ - init_completion(&rcu.completion); /* Will wake me after RCU finished. 
*/ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/trunk/kernel/rcutree.c b/trunk/kernel/rcutree.c index 97ce31579ec0..b2fd602a6f6f 100644 --- a/trunk/kernel/rcutree.c +++ b/trunk/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 0a76d0b6f215..7d97ff7c4478 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -9219,16 +9219,6 @@ static int sched_rt_global_constraints(void) return ret; } - -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9322,7 +9312,8 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -9485,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9504,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9600,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/trunk/kernel/stop_machine.c b/trunk/kernel/stop_machine.c index 74541ca49536..0cd415ee62a2 100644 --- a/trunk/kernel/stop_machine.c +++ b/trunk/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. 
*/ get_cpu(); for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); + sm_work = percpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } diff --git a/trunk/kernel/sys.c b/trunk/kernel/sys.c index 37f458e6882a..f145c415bc16 100644 --- a/trunk/kernel/sys.c +++ b/trunk/kernel/sys.c @@ -559,7 +559,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,11 +571,6 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -636,11 +631,10 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - if (new->uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } + retval = -EAGAIN; + if (new->uid != old->uid && set_user(new) < 0) + goto error; + if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -686,10 +680,9 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; + if (uid != old->uid && set_user(new) < 0) { + retval = -EAGAIN; + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -741,13 +734,11 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } + retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } + if (ruid != old->uid && set_user(new) < 0) + goto error; } if (euid != (uid_t) -1) new->euid = euid; diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 6a9b696128c8..3551ac742395 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -362,24 +362,6 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif -#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) -/* - * We need to check if a setuid can take place. This function should be called - * before successfully completing the setuid. - */ -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - - return sched_rt_can_attach(up->tg, tsk); - -} -#else -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - return 1; -} -#endif - /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). 
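
The setuid/setreuid hunks in kernel/sys.c above all follow the same credentials protocol: build a private copy, mutate it, then either commit or abort it. A rough fragment showing that ordering (set_user() is local to kernel/sys.c and the surrounding function is omitted; this is only a sketch of the life cycle, not the real syscall, which also handles the saved-UID cases):

    struct cred *new = prepare_creds();
    if (!new)
        return -ENOMEM;

    new->uid = uid;                  /* mutate the private copy ...              */
    if (set_user(new) < 0) {         /* ... which may fail, e.g. on RLIMIT_NPROC */
        abort_creds(new);            /* throw the copy away; old creds stay live */
        return -EAGAIN;
    }
    return commit_creds(new);        /* atomically install the new credentials   */
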
diff --git a/trunk/mm/Makefile b/trunk/mm/Makefile index 818569b68f46..72255be57f89 100644 --- a/trunk/mm/Makefile +++ b/trunk/mm/Makefile @@ -30,10 +30,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -obj-$(CONFIG_SMP) += percpu.o -else obj-$(CONFIG_SMP) += allocpercpu.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/trunk/mm/allocpercpu.c b/trunk/mm/allocpercpu.c index 3653c570232b..4297bc41bfd2 100644 --- a/trunk/mm/allocpercpu.c +++ b/trunk/mm/allocpercpu.c @@ -99,51 +99,45 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * alloc_percpu - initial setup of per-cpu data + * percpu_alloc_mask - initial setup of per-cpu data * @size: size of per-cpu object - * @align: alignment + * @gfp: may sleep or not etc. + * @mask: populate per-data for cpu's selected through mask bits * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. + * Populating per-cpu data for all online cpu's would be a typical use case, + * which is simplified by the percpu_alloc() wrapper. + * Per-cpu objects are populated with zeroed buffers. */ -void *__alloc_percpu(size_t size, size_t align) +void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); + void *pdata = kzalloc(sz, gfp); void *__pdata = __percpu_disguise(pdata); - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); - if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) + if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__alloc_percpu); +EXPORT_SYMBOL_GPL(__percpu_alloc_mask); /** - * free_percpu - final cleanup of per-cpu data + * percpu_free - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. 
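
The allocpercpu scheme that remains after this hunk is a simple indirection table: the object handed back is (a disguised pointer to) an array of nr_cpu_ids pointers, each populated slot pointing at its own zeroed buffer. A toy userspace model of populate and free, ignoring the cache-line rounding, the pointer disguise, and allocation-failure handling that the real code has:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4

    int main(void)
    {
        /* model of __percpu_alloc_mask(): a table of NR_CPUS pointers ...      */
        long **pdata = calloc(NR_CPUS, sizeof(*pdata));
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            pdata[cpu] = calloc(1, sizeof(long));    /* ... each a zeroed buffer */

        *pdata[1] = 42;                              /* "percpu_ptr(pdata, 1)" in spirit */
        printf("cpu1=%ld cpu2=%ld\n", *pdata[1], *pdata[2]);

        for (int cpu = 0; cpu < NR_CPUS; cpu++)      /* model of percpu_free() */
            free(pdata[cpu]);
        free(pdata);
        return 0;
    }
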
*/ -void free_percpu(void *__pdata) +void percpu_free(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(free_percpu); +EXPORT_SYMBOL_GPL(percpu_free); diff --git a/trunk/mm/bootmem.c b/trunk/mm/bootmem.c index daf92713f7de..51a0ccf61e0e 100644 --- a/trunk/mm/bootmem.c +++ b/trunk/mm/bootmem.c @@ -382,6 +382,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -402,6 +403,7 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -427,8 +429,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, } static void * __init alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -528,34 +530,17 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, return NULL; } -static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ -#ifdef CONFIG_HAVE_ARCH_BOOTMEM - bootmem_data_t *p_bdata; - - p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); - if (p_bdata) - return alloc_bootmem_core(p_bdata, size, align, goal, limit); -#endif - return NULL; -} - static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; - void *region; restart: - region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); - if (region) - return region; - list_for_each_entry(bdata, &bdata_list, list) { + void *region; + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -633,10 +618,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; - ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); - if (ptr) - return ptr; - ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -693,10 +674,6 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; diff --git a/trunk/mm/percpu.c b/trunk/mm/percpu.c deleted file mode 100644 index 3d0f5456827c..000000000000 --- a/trunk/mm/percpu.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * linux/mm/percpu.c - percpu memory allocator - * - * Copyright (C) 2009 SUSE Linux Products GmbH - * Copyright (C) 2009 Tejun Heo - * - * This file is released under the GPLv2. - * - * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks in vmalloc area. 
Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area - * - * c0 c1 c2 - * ------------------- ------------------- ------------ - * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u - * ------------------- ...... ------------------- .... ------------ - * - * Allocation is done in offset-size areas of single unit space. Ie, - * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. - * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists - * according to free size and tries to allocate from the fullest one. - * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous - * area in the chunk. This helps the allocator not to iterate the - * chunk maps unnecessarily. - * - * Allocation state in each chunk is kept using an array of integers - * on chunk->map. A positive value in the map represents a free - * region and negative allocated. Allocation inside a chunk is done - * by scanning this map sequentially and serving the first matching - * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. - * - * To use this allocator, arch code should do the followings. - * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - * - * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate - * regular address to percpu pointer and back - * - * - use pcpu_setup_first_chunk() during percpu area initialization to - * setup the first chunk containing the kernel static percpu area - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ - -struct pcpu_chunk { - struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ - int map_used; /* # of map entries used */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - bool immutable; /* no [de]population allowed */ - struct page *page[]; /* #cpus * UNIT_PAGES */ -}; - -static int pcpu_unit_pages __read_mostly; -static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; -static int pcpu_nr_slots __read_mostly; -static size_t pcpu_chunk_struct_size __read_mostly; - -/* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr __read_mostly; -EXPORT_SYMBOL_GPL(pcpu_base_addr); - -/* the size of kernel static area */ -static int pcpu_static_size __read_mostly; - -/* - * One mutex to rule them all. 
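
Given the layout described in the header comment of the removed mm/percpu.c, translating (chunk, cpu, offset) into an address is pure arithmetic: a chunk covers num_possible_cpus() units of pcpu_unit_size bytes, so CPU N's copy of an allocation at offset off lives at chunk_base + N * unit_size + off. A tiny userspace check of that math, mirroring pcpu_chunk_addr() with made-up sizes:

    #include <stdio.h>

    #define NR_CPUS   4
    #define UNIT_SIZE (16 * 4096)                    /* stand-in for pcpu_unit_size */

    static unsigned long chunk_addr(unsigned long base, int cpu, unsigned long off)
    {
        return base + (unsigned long)cpu * UNIT_SIZE + off;   /* cf. pcpu_chunk_addr() */
    }

    int main(void)
    {
        unsigned long base = 0xf0000000UL;           /* stand-in for chunk->vm->addr */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d copy of offset 0x200 at %#lx\n",
                   cpu, chunk_addr(base, cpu, 0x200));
        return 0;
    }
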
- * - * The following mutex is grabbed in the outermost public alloc/free - * interface functions and released only when the operation is - * complete. As such, every function in this file other than the - * outermost functions are called under pcpu_mutex. - * - * It can easily be switched to use spinlock such that only the area - * allocation and page population commit are protected with it doing - * actual [de]allocation without holding any lock. However, given - * what this allocator does, I think it's better to let them run - * sequentially. - */ -static DEFINE_MUTEX(pcpu_mutex); - -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ - -static int __pcpu_size_to_slot(int size) -{ - int highbit = fls(size); /* size is in bytes */ - return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); -} - -static int pcpu_size_to_slot(int size) -{ - if (size == pcpu_unit_size) - return pcpu_nr_slots - 1; - return __pcpu_size_to_slot(size); -} - -static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) -{ - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) - return 0; - - return pcpu_size_to_slot(chunk->free_size); -} - -static int pcpu_page_idx(unsigned int cpu, int page_idx) -{ - return cpu * pcpu_unit_pages + page_idx; -} - -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} - -static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); -} - -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) -{ - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; -} - -/** - * pcpu_realloc - versatile realloc - * @p: the current pointer (can be NULL for new allocations) - * @size: the current size in bytes (can be 0 for new allocations) - * @new_size: the wanted new size in bytes (can be 0 for free) - * - * More robust realloc which can be used to allocate, resize or free a - * memory area of arbitrary size. If the needed size goes over - * PAGE_SIZE, kernel VM is used. - * - * RETURNS: - * The new pointer on success, NULL on failure. - */ -static void *pcpu_realloc(void *p, size_t size, size_t new_size) -{ - void *new; - - if (new_size <= PAGE_SIZE) - new = kmalloc(new_size, GFP_KERNEL); - else - new = vmalloc(new_size); - if (new_size && !new) - return NULL; - - memcpy(new, p, min(size, new_size)); - if (new_size > size) - memset(new + size, 0, new_size - size); - - if (size <= PAGE_SIZE) - kfree(p); - else - vfree(p); - - return new; -} - -/** - * pcpu_chunk_relocate - put chunk in the appropriate chunk slot - * @chunk: chunk of interest - * @oslot: the previous slot it was on - * - * This function is called after an allocation or free changed @chunk. - * New slot according to the changed state is determined and @chunk is - * moved to the slot. 
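
The slot helpers above bucket chunks roughly by the power of two of their free space: slot = max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1). A userspace replica using __builtin_clz to stand in for the kernel's fls(), with the same shift constant as the file:

    #include <stdio.h>

    #define PCPU_SLOT_BASE_SHIFT 5                   /* same constant as in the file above */

    static int fls(unsigned int x)                   /* kernel-style fls(): MSB position, 1-based */
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    static int size_to_slot(int size)                /* cf. __pcpu_size_to_slot() */
    {
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;
        return slot > 1 ? slot : 1;
    }

    int main(void)
    {
        int sizes[] = { 4, 31, 32, 512, 4096, 65536 };
        for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
            printf("size %6d -> slot %d\n", sizes[i], size_to_slot(sizes[i]));
        return 0;
    }
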
- */ -static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) -{ - int nslot = pcpu_chunk_slot(chunk); - - if (oslot != nslot) { - if (oslot < nslot) - list_move(&chunk->list, &pcpu_slot[nslot]); - else - list_move_tail(&chunk->list, &pcpu_slot[nslot]); - } -} - -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - -/** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * RETURNS: - * The address of the found chunk. - */ -static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) -{ - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; - - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); -} - -/** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) -{ - int nr_extra = !!head + !!tail; - int target = chunk->map_used + nr_extra; - - /* reallocation required? 
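
pcpu_chunk_addr_search(), shown above, is a floor lookup: find the chunk with the highest start address that is not beyond the given pointer. The kernel walks an rb-tree; the same semantics over a sorted array in a small userspace sketch:

    #include <stdio.h>

    /* floor search: index of the highest base <= addr, bases sorted ascending */
    static int addr_to_chunk(const unsigned long *bases, int n, unsigned long addr)
    {
        int lo = 0, hi = n - 1, best = -1;

        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;
            if (bases[mid] <= addr) {
                best = mid;                          /* candidate, look further right */
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return best;                                 /* -1: addr lies before every chunk */
    }

    int main(void)
    {
        unsigned long bases[] = { 0x1000, 0x9000, 0x20000 };   /* chunk->vm->addr values */
        printf("0x9abc -> chunk %d\n", addr_to_chunk(bases, 3, 0x9abc));   /* 1 */
        printf("0x0fff -> chunk %d\n", addr_to_chunk(bases, 3, 0x0fff));   /* -1 */
        return 0;
    }
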
*/ - if (chunk->map_alloc < target) { - int new_alloc = chunk->map_alloc; - int *new; - - while (new_alloc < target) - new_alloc *= 2; - - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); - if (!new) - return -ENOMEM; - - chunk->map_alloc = new_alloc; - chunk->map = new; - } - - /* insert a new subblock */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } - return 0; -} - -/** - * pcpu_alloc_area - allocate area from a pcpu_chunk - * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. - * - * RETURNS: - * Allocated offset in @chunk on success, -errno on failure. - */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) -{ - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - - /* - * The static chunk initially doesn't have map attached - * because kmalloc wasn't available during init. Give it one. - */ - if (unlikely(!chunk->map)) { - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - if (!chunk->map) - return -ENOMEM; - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = -pcpu_static_size; - if (chunk->free_size) - chunk->map[chunk->map_used++] = chunk->free_size; - } - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; - int head, tail; - - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); - continue; - } - - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. - */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; - chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; - head = 0; - } - - /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) - tail = 0; - - /* split if warranted */ - if (head || tail) { - if (pcpu_split_block(chunk, i, head, tail)) - return -ENOMEM; - if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); - } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); - } - - /* update hint and mark allocated */ - if (is_last) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); - - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; - - pcpu_chunk_relocate(chunk, oslot); - return off; - } - - chunk->contig_hint = max_contig; /* fully scanned */ - pcpu_chunk_relocate(chunk, oslot); - - /* - * Tell the upper layer that this chunk has no area left. - * Note that this is not an error condition but a notification - * to upper layer that it needs to look at other chunks. 
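
The allocation map that pcpu_alloc_area() scans encodes runs as signed lengths: a positive entry is a free run, a negative one is allocated, and allocation is first-fit with an optional split. A stripped-down userspace model of that scan (no alignment handling, no map realloc), just to make the encoding concrete:

    #include <stdio.h>

    #define MAP_MAX 16

    static int map[MAP_MAX] = { 4096 };   /* one free run covering the whole unit */
    static int map_used = 1;

    /* first-fit over signed runs: >0 free, <0 allocated; returns offset or -1 */
    static int alloc_area(int size)
    {
        int off = 0;

        for (int i = 0; i < map_used; i++) {
            int len = map[i];

            if (len < size) {                        /* allocated (<0) or too small: skip */
                off += len < 0 ? -len : len;
                continue;
            }
            if (len > size) {                        /* split off the unused tail */
                if (map_used == MAP_MAX)
                    return -1;                       /* toy model: no map realloc */
                for (int j = map_used; j > i; j--)
                    map[j] = map[j - 1];
                map[i + 1] = len - size;
                map_used++;
            }
            map[i] = -size;                          /* mark this run allocated */
            return off;
        }
        return -1;                                   /* caller should try another chunk */
    }

    int main(void)
    {
        printf("a=%d b=%d c=%d\n", alloc_area(512), alloc_area(1024), alloc_area(512));
        /* expected: a=0 b=512 c=1536; map is now -512, -1024, -512, 2048 */
        return 0;
    }
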
- * -ENOSPC is chosen as it isn't used in memory subsystem and - * matches the meaning in a way. - */ - return -ENOSPC; -} - -/** - * pcpu_free_area - free area to a pcpu_chunk - * @chunk: chunk of interest - * @freeme: offset of area to free - * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. - */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) -{ - int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; - BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; - - /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); - i--; - } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); - } - - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); - pcpu_chunk_relocate(chunk, oslot); -} - -/** - * pcpu_unmap - unmap pages out of a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not - * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. - */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); -} - -/** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. 
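
Freeing is the mirror image: flip the run back to positive and coalesce with free neighbours, which is what pcpu_free_area() does before moving the chunk to its new slot. A companion to the toy alloc_area() sketch above (assumes @freeme was returned by that allocator; not kernel code):

    /* free the run starting at @freeme and merge with adjacent free runs */
    static void free_area(int *map, int *map_used, int freeme)
    {
        int off = 0, i;

        for (i = 0; i < *map_used; i++) {            /* locate the run at offset @freeme */
            if (off == freeme)
                break;
            off += map[i] < 0 ? -map[i] : map[i];
        }
        map[i] = -map[i];                            /* allocated -> free */

        if (i + 1 < *map_used && map[i + 1] > 0) {   /* merge with the next free run */
            map[i] += map[i + 1];
            for (int j = i + 1; j < *map_used - 1; j++)
                map[j] = map[j + 1];
            (*map_used)--;
        }
        if (i > 0 && map[i - 1] > 0) {               /* merge with the previous free run */
            map[i - 1] += map[i];
            for (int j = i; j < *map_used - 1; j++)
                map[j] = map[j + 1];
            (*map_used)--;
        }
    }
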
- */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) -{ - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - if (!*pagep) - continue; - - __free_page(*pagep); - - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; - - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; - } - } - - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); -} - -/** - * pcpu_map - map pages into a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to map - * @page_end: page index of the last page to map + 1 - * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. - */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); - if (err < 0) - return err; - } - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - return 0; -} - -/** - * pcpu_populate_chunk - populate and map an area of a pcpu_chunk - * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes - * - * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. - */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) -{ - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } - - map_start = map_start < 0 ? 
i : map_start; - map_end = i + 1; - - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - } - } - - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; - - for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); - - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; -} - -static void free_pcpu_chunk(struct pcpu_chunk *chunk) -{ - if (!chunk) - return; - if (chunk->vm) - free_vm_area(chunk->vm); - pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); - kfree(chunk); -} - -static struct pcpu_chunk *alloc_pcpu_chunk(void) -{ - struct pcpu_chunk *chunk; - - chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); - if (!chunk) - return NULL; - - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; - - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); - if (!chunk->vm) { - free_pcpu_chunk(chunk); - return NULL; - } - - INIT_LIST_HEAD(&chunk->list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; - - return chunk; -} - -/** - * __alloc_percpu - allocate percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - void *ptr = NULL; - struct pcpu_chunk *chunk; - int slot, off; - - if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); - return NULL; - } - - mutex_lock(&pcpu_mutex); - - /* allocate area */ - for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { - list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) - continue; - off = pcpu_alloc_area(chunk, size, align); - if (off >= 0) - goto area_found; - if (off != -ENOSPC) - goto out_unlock; - } - } - - /* hmmm... no space left, create a new chunk */ - chunk = alloc_pcpu_chunk(); - if (!chunk) - goto out_unlock; - pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); - - off = pcpu_alloc_area(chunk, size, align); - if (off < 0) - goto out_unlock; - -area_found: - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - pcpu_free_area(chunk, off); - goto out_unlock; - } - - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); -out_unlock: - mutex_unlock(&pcpu_mutex); - return ptr; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -static void pcpu_kill_chunk(struct pcpu_chunk *chunk) -{ - WARN_ON(chunk->immutable); - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - list_del(&chunk->list); - rb_erase(&chunk->rb_node, &pcpu_addr_root); - free_pcpu_chunk(chunk); -} - -/** - * free_percpu - free percpu area - * @ptr: pointer to area to free - * - * Free percpu area @ptr. Might sleep. 
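
From a caller's point of view, the removed allocator keeps the familiar interface: allocate once, address each CPU's copy through per_cpu_ptr(), free once. A hedged kernel-style usage sketch (the structure and function names are hypothetical, shown only to illustrate the calling pattern):

    struct hit_counter {                         /* hypothetical example structure */
        unsigned long hits;
    };
    static struct hit_counter *counters;

    static int counters_init(void)
    {
        counters = __alloc_percpu(sizeof(*counters), __alignof__(*counters));
        return counters ? 0 : -ENOMEM;
    }

    static void counters_hit(void)
    {
        int cpu = get_cpu();                     /* stay on this CPU while touching our slot */
        per_cpu_ptr(counters, cpu)->hits++;
        put_cpu();
    }

    static unsigned long counters_total(void)
    {
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)               /* fold every CPU's copy */
            sum += per_cpu_ptr(counters, cpu)->hits;
        return sum;
    }
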
- */ -void free_percpu(void *ptr) -{ - void *addr = __pcpu_ptr_to_addr(ptr); - struct pcpu_chunk *chunk; - int off; - - if (!ptr) - return; - - mutex_lock(&pcpu_mutex); - - chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; - - pcpu_free_area(chunk, off); - - /* the chunk became fully free, kill one if there are other free ones */ - if (chunk->free_size == pcpu_unit_size) { - struct pcpu_chunk *pos; - - list_for_each_entry(pos, - &pcpu_slot[pcpu_chunk_slot(chunk)], list) - if (pos != chunk) { - pcpu_kill_chunk(pos); - break; - } - } - - mutex_unlock(&pcpu_mutex); -} -EXPORT_SYMBOL_GPL(free_percpu); - -/** - * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary - * - * Initialize the first percpu chunk which contains the kernel static - * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. - * - * @free_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. - * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. - * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. - * - * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. 
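
The long comment above is effectively the arch contract for pcpu_setup_first_chunk(). A rough sketch of how an architecture's setup path might satisfy it, assuming the static area has already been copied into per-CPU pages; STATIC_PAGES, pcpu_pages, my_get_page and my_populate_pte are all hypothetical names for this sketch:

    static struct page *pcpu_pages[NR_CPUS][STATIC_PAGES];  /* filled in by the arch */

    static struct page *my_get_page(unsigned int cpu, int pageno)
    {
        if (pageno >= STATIC_PAGES)
            return NULL;                         /* end of pages for this cpu */
        return pcpu_pages[cpu][pageno];
    }

    void __init setup_per_cpu_areas(void)
    {
        size_t unit;

        /* auto unit size, no reserved tail, let percpu map the pages itself */
        unit = pcpu_setup_first_chunk(my_get_page,
                                      __per_cpu_end - __per_cpu_start,
                                      0, 0, NULL, my_populate_pte);
        /* ... then derive each CPU's per_cpu_offset() from pcpu_base_addr and unit ... */
    }
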
- */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) -{ - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; - unsigned int cpu; - int nr_pages; - int err, i; - - /* santiy checks */ - BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); - - pcpu_static_size = static_size; - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); - - /* - * Allocate chunk slots. The additional last slot is for - * empty chunks. - */ - pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); - for (i = 0; i < pcpu_nr_slots; i++) - INIT_LIST_HEAD(&pcpu_slot[i]); - - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; - - if (free_size) - static_chunk->free_size = free_size; - else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; - - static_chunk->contig_hint = static_chunk->free_size; - - /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked - * immutable. - */ - static_vm.addr = base_addr; - static_chunk->immutable = true; - } - - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(pcpu_static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, - cpu, i)); - - err = pcpu_map(static_chunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); - - /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); - return pcpu_unit_size; -} diff --git a/trunk/mm/vmalloc.c b/trunk/mm/vmalloc.c index af58324c361a..11a929872ebd 100644 --- a/trunk/mm/vmalloc.c +++ b/trunk/mm/vmalloc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -153,8 +152,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,22 +169,13 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); + flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) -{ - int ret; - - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); - return ret; -} - static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1000,32 +990,6 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); -/** - * vm_area_register_early - register vmap area early during boot - * @vm: vm_struct to register - * @align: requested alignment - * - * This function is used to register kernel vm area before - * vmalloc_init() is called. @vm->size and @vm->flags should contain - * proper values on entry and other fields should be zero. On return, - * vm->addr contains the allocated address. - * - * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. - */ -void __init vm_area_register_early(struct vm_struct *vm, size_t align) -{ - static size_t vm_init_off __initdata; - unsigned long addr; - - addr = ALIGN(VMALLOC_START + vm_init_off, align); - vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - - vm->addr = (void *)addr; - - vm->next = vmlist; - vmlist = vm; -} - void __init vmalloc_init(void) { struct vmap_area *va; @@ -1053,58 +1017,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. 
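
The *_noflush variants removed above push cache and TLB maintenance out to the caller so a whole range can be flushed once instead of per CPU. The calling discipline, written out as a fragment that mirrors what pcpu_map()/pcpu_unmap() in the removed mm/percpu.c actually do:

    /* unmapping with the _noflush API: caller brackets it with the flushes */
    flush_cache_vunmap(start, end);              /* before tearing down the PTEs */
    unmap_kernel_range_noflush(start, end - start);
    flush_tlb_kernel_range(start, end);          /* after */

    /* mapping is the mirror image: populate the PTEs, then flush the vcache once */
    if (map_kernel_range_noflush(start, end - start, PAGE_KERNEL, pages) < 0)
        goto err;                                /* partial mappings handled by the caller */
    flush_cache_vmap(start, end);
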
- */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} - -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; diff --git a/trunk/net/ipv4/af_inet.c b/trunk/net/ipv4/af_inet.c index 3a3dad801354..743f5542d65a 100644 --- a/trunk/net/ipv4/af_inet.c +++ b/trunk/net/ipv4/af_inet.c @@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[0] = __alloc_percpu(mibsize); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + ptr[1] = __alloc_percpu(mibsize); if (!ptr[1]) goto err1; return 0; diff --git a/trunk/net/ipv4/route.c b/trunk/net/ipv4/route.c index bf895401218f..97f71153584f 100644 --- a/trunk/net/ipv4/route.c +++ b/trunk/net/ipv4/route.c @@ -3376,7 +3376,7 @@ int __init ip_rt_init(void) int rc = 0; #ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif
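
The extra argument in the snmp_mib_init() and ip_rt_init() hunks above is the allocation's alignment: with the dynamic allocator, callers state it explicitly so the 64-bit MIB counters stay naturally aligned even on 32-bit hosts. The convenience wrapper presumably reduces to something like the following (a sketch; see include/linux/percpu.h in the full tree for the real macro):

    #define alloc_percpu(type) \
        ((type *)__alloc_percpu(sizeof(type), __alignof__(type)))

    /* what snmp_mib_init() asks for: mibsize bytes, aligned for u64 counters */
    ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
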