From 61f734d897708949a7d76df1960bcddbd5be8e86 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 20 Feb 2009 18:57:58 -0800 Subject: [PATCH] --- yaml --- r: 136942 b: refs/heads/master c: a47e3ec197f515e25c77805f02d26f9e86456f65 h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/arch/x86/ia32/ia32_signal.c | 18 - trunk/arch/x86/include/asm/processor.h | 9 +- trunk/arch/x86/kernel/ptrace.c | 2 +- trunk/arch/x86/mm/fault.c | 1090 +++++++++++------------- trunk/arch/x86/mm/pageattr.c | 7 - trunk/arch/x86/vdso/vma.c | 4 +- trunk/include/linux/kprobes.h | 22 +- trunk/include/linux/mmiotrace.h | 78 +- 9 files changed, 527 insertions(+), 705 deletions(-) diff --git a/[refs] b/[refs] index 4513e94371a2..71b1acdabd84 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: b319eed0aa0a6d710887350a3cb734c572aa64c4 +refs/heads/master: a47e3ec197f515e25c77805f02d26f9e86456f65 diff --git a/trunk/arch/x86/ia32/ia32_signal.c b/trunk/arch/x86/ia32/ia32_signal.c index dd77ac0cac46..adc63f81cb8e 100644 --- a/trunk/arch/x86/ia32/ia32_signal.c +++ b/trunk/arch/x86/ia32/ia32_signal.c @@ -33,8 +33,6 @@ #include #include -#define DEBUG_SIG 0 - #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ @@ -220,12 +218,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG restore_sigcontext: " - "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->ip, sc->cs, sc->flags); -#endif - get_user_try { /* * Reload fs and gs if they have changed in the signal @@ -488,11 +480,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->cs = __USER32_CS; regs->ss = __USER32_DS; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->ip, frame->pretcode); -#endif - return 0; } @@ -574,10 +561,5 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER32_CS; regs->ss = __USER32_DS; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->ip, frame->pretcode); -#endif - return 0; } diff --git a/trunk/arch/x86/include/asm/processor.h b/trunk/arch/x86/include/asm/processor.h index c7a98f738210..72914d0315e9 100644 --- a/trunk/arch/x86/include/asm/processor.h +++ b/trunk/arch/x86/include/asm/processor.h @@ -861,7 +861,6 @@ static inline void spin_lock_prefetch(const void *x) * User space process size: 3GB (default). */ #define TASK_SIZE PAGE_OFFSET -#define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -921,7 +920,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -930,12 +929,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); 0xc0000000 : 0xFFFFe000) #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) + IA32_PAGE_OFFSET : TASK_SIZE64) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
\ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) + IA32_PAGE_OFFSET : TASK_SIZE64) #define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE_MAX +#define STACK_TOP_MAX TASK_SIZE64 #define INIT_THREAD { \ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ diff --git a/trunk/arch/x86/kernel/ptrace.c b/trunk/arch/x86/kernel/ptrace.c index fb2159a5c817..d2f7cd5b2c83 100644 --- a/trunk/arch/x86/kernel/ptrace.c +++ b/trunk/arch/x86/kernel/ptrace.c @@ -268,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task) if (test_tsk_thread_flag(task, TIF_IA32)) return IA32_PAGE_OFFSET - 3; #endif - return TASK_SIZE_MAX - 7; + return TASK_SIZE64 - 7; } #endif /* CONFIG_X86_32 */ diff --git a/trunk/arch/x86/mm/fault.c b/trunk/arch/x86/mm/fault.c index a03b7279efa0..29644175490f 100644 --- a/trunk/arch/x86/mm/fault.c +++ b/trunk/arch/x86/mm/fault.c @@ -1,79 +1,74 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. - * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. */ -#include + +#include +#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include /* For unblank_screen() */ #include #include -#include -#include +#include /* for max_low_pfn */ #include -#include -#include -#include -#include -#include #include +#include +#include #include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include +#include +#include +#include +#include +#include #include +#include #include -#include /* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch + * Page fault error code bits + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch */ -enum x86_pf_error_code { - - PF_PROT = 1 << 0, - PF_WRITE = 1 << 1, - PF_USER = 1 << 2, - PF_RSVD = 1 << 3, - PF_INSTR = 1 << 4, -}; +#define PF_PROT (1<<0) +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) -/* - * Returns 0 if mmiotrace is disabled, or if the fault is not - * handled by mmiotrace: - */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; +#endif return 0; } static inline int notify_page_fault(struct pt_regs *regs) { +#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode_vm(regs)) { + if (!user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -81,76 +76,29 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; +#else + return 0; +#endif } /* - * Prefetch quirks: - * - * 32-bit mode: + * X86_32 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. 
+ * X86_64 + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. * - * 64-bit mode: - * - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. - * - * Opcode checker based on code by Richard Brunner. + * Opcode checker based on code by Richard Brunner */ -static inline int -check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, - unsigned char opcode, int *prefetch) -{ - unsigned char instr_hi = opcode & 0xf0; - unsigned char instr_lo = opcode & 0x0f; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - return ((instr_lo & 7) == 0x6); -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - return (!user_mode(regs)) || (regs->cs == __USER_CS); -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - return (instr_lo & 0xC) == 0x4; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - return !instr_lo || (instr_lo>>1) == 1; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - if (probe_kernel_address(instr, opcode)) - return 0; - - *prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - return 0; - default: - return 0; - } -} - -static int -is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { - unsigned char *max_instr; unsigned char *instr; + int scan_more = 1; int prefetch = 0; + unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -159,170 +107,106 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (void *)convert_ip_to_linear(current, regs); + instr = (unsigned char *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (instr < max_instr) { + while (scan_more && instr < max_instr) { unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; if (probe_kernel_address(instr, opcode)) break; + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; instr++; - if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. 
+ * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + scan_more = ((instr_lo & 7) == 0x6); break; - } - return prefetch; -} - -static void -force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - - force_sig_info(si_signo, &info, tsk); -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -void vmalloc_sync_all(void) -{ - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + if (probe_kernel_address(instr, opcode)) break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; } - spin_unlock_irqrestore(&pgd_lock, flags); } + return prefetch; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) +static void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) { - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. 
- */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; + siginfo_t info; - return 0; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) +#ifdef CONFIG_X86_64 +static int bad_address(void *p) { - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } +#endif static void dump_pagetable(unsigned long address) { +#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; - #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -334,224 +218,123 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already: + * it's allocated already. */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { - page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } printk("\n"); -} - -#else /* CONFIG_X86_64: */ - -void vmalloc_sync_all(void) -{ - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. 
In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; +#else /* CONFIG_X86_64 */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + pgd = (pgd_t *)read_cr3(); - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto ret; pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -} - -static int bad_address(void *p) -{ - unsigned long dummy; - - return probe_kernel_address((unsigned long *)p, dummy); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +#endif } -static void dump_pagetable(unsigned long address) +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - pgd += pgd_index(address); - if (bad_address(pgd)) - goto bad; + pgd += index; + pgd_k = init_mm.pgd + index; - printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd_k)) + return NULL; - if (!pgd_present(*pgd)) - goto out; + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
+ */ pud = pud_offset(pgd, address); - if (bad_address(pud)) - goto bad; - - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud) || pud_large(*pud)) - goto out; + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) - goto bad; - - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) - goto out; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) - goto bad; - - printk("PTE %lx", pte_val(*pte)); -out: - printk("\n"); - return; -bad: - printk("BAD\n"); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; } +#endif -#endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_64 +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif -/* - * Workaround for K8 erratum #93 & buggy BIOS. - * - * BIOS SMM functions are required to use a specific workaround - * to avoid corruption of the 64bit RIP register on C stepping K8. - * - * A lot of BIOS that didn't get tested properly miss this. - * - * The OS sees this as a page fault with the upper 32bits of RIP cleared. - * Try to work around it here. - * - * Note we only handle faults in kernel here. - * Does nothing on 32-bit. +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. + Does nothing for X86_32 */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - + static int warned; if (address != regs->ip) return 0; - if ((address >> 32) != 0) return 0; - address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { + if (!warned) { printk(errata93_warning); - once = 1; + warned = 1; } regs->ip = address; return 1; @@ -561,17 +344,16 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps - * to illegal addresses >4GB. - * - * We catch this in the page fault handler because these addresses - * are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal + * addresses >4GB. We catch this in the page fault handler because these + * addresses are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. 
*/ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) return 1; #endif return 0; @@ -581,9 +363,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; - /* - * Pentium F0 0F C7 C8 bug workaround: + * Pentium F0 0F C7 C8 bug workaround. */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -597,87 +378,80 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static const char nx_warning[] = KERN_CRIT -"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; - -static void -show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { +#ifdef CONFIG_X86_32 if (!oops_may_print()) return; +#endif +#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; - pte_t *pte = lookup_address(address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(nx_warning, current_uid()); + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current_uid()); } +#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); - printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); - dump_pagetable(address); } -static noinline void -pgtable_bad(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +#ifdef CONFIG_X86_64 +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { - struct task_struct *tsk; - unsigned long flags; - int sig; - - flags = oops_begin(); - tsk = current; - sig = SIGKILL; + unsigned long flags = oops_begin(); + int sig = SIGKILL; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) sig = 0; - oops_end(flags, regs, sig); } +#endif -static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; + +#ifdef CONFIG_X86_64 unsigned long flags; int sig; +#endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * 32-bit: - * - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. * - * 64-bit: - * - * Hall of shame of CPU/BIOS bugs. + * X86_64 + * Hall of shame of CPU/BIOS bugs. 
*/ if (is_prefetch(regs, error_code, address)) return; @@ -687,70 +461,54 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice: + * terminate things with extreme prejudice. */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else flags = oops_begin(); +#endif show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; - /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); +#endif } -/* - * Print out info about fatal segfaults, if the show_unhandled_signals - * sysctl is set: - */ -static inline void -show_signal_msg(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct task_struct *tsk) -{ - if (!unhandled_signal(tsk, SIGSEGV)) - return; - - if (!printk_ratelimit()) - return; - - printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *)regs->ip, (void *)regs->sp, error_code); - - print_vma_addr(KERN_CONT " in ", regs->ip); - - printk(KERN_CONT "\n"); -} - -static void -__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here: + * It's possible to have interrupts off here. */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space: + * from user space. */ if (is_prefetch(regs, error_code, address)) return; @@ -758,16 +516,22 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; - if (unlikely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); - - /* Kernel addresses are always protection faults: */ - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; } @@ -777,16 +541,15 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, no_context(regs, error_code, address); } -static noinline void -bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void -__bad_area(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) { struct mm_struct *mm = current->mm; @@ -799,75 +562,67 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void -bad_area_access_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void -out_of_memory(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed): + * (which will retry the fault, or kill us if we got oom-killed). */ up_read(¤t->mm->mmap_sem); - pagefault_out_of_memory(); } -static void -do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die: */ + /* Kernel mode? 
Handle exceptions or die */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); - - /* User-space => ok to do another page fault: */ +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ if (is_prefetch(regs, error_code, address)) return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void -mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, unsigned int fault) +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) { + if (fault & VM_FAULT_OOM) out_of_memory(regs, error_code, address); - } else { - if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); - } + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -875,25 +630,21 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. - * - * This allows us to lazily refresh the TLB when increasing the - * permissions of a kernel page (RO -> RW or NX -> X). Doing it - * eagerly is very expensive since that implies doing a full - * cross-processor TLB flush, even if no stale TLB entries exist - * on other processors. - * + * Handle a spurious fault caused by a stale TLB entry. This allows + * us to lazily refresh the TLB when increasing the permissions of a + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very + * expensive since that implies doing a full cross-processor TLB + * flush, even if no stale TLB entries exist on other processors. * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int -spurious_fault(unsigned long error_code, unsigned long address) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -921,46 +672,123 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pte_present(*pte)) return 0; - ret = spurious_fault_check(error_code, pte); - if (!ret) - return 0; + return spurious_fault_check(error_code, pte); +} + +/* + * X86_32 + * Handle a fault on the vmalloc or module mapping area + * + * X86_64 + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ +#ifdef CONFIG_X86_32 + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; /* - * Make sure we have permissions in PMD. - * If not, then there's a bug in the page tables: + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. 
*/ - ret = spurious_fault_check(error_code, (pte_t *) pmd); - WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +#else + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; - return ret; + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +#endif } int show_unhandled_signals = 1; -static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present: */ + /* write, present and write, not present */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - return 0; - } - - /* read, present: */ - if (unlikely(error_code & PF_PROT)) - return 1; - - /* read, not present: */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } return 0; } static int fault_in_kernel_space(unsigned long address) { - return address >= TASK_SIZE_MAX; +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ } /* @@ -968,22 +796,23 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. 
*/ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - struct vm_area_struct *vma; - struct task_struct *tsk; unsigned long address; + struct task_struct *tsk; struct mm_struct *mm; + struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; - prefetchw(&mm->mmap_sem); - /* Get the faulting address: */ + /* get the address */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -1007,23 +836,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB: */ + /* Can handle a stale RO->RW TLB */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults: */ + /* kprobes don't want to hook the spurious faults. */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock: + * fault we could otherwise deadlock. */ bad_area_nosemaphore(regs, error_code, address); - return; } - /* kprobes don't want to hook the spurious faults: */ + /* kprobes don't want to hook the spurious faults. */ if (unlikely(notify_page_fault(regs))) return; /* @@ -1031,22 +859,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: + * potential system fault or CPU buglet. */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); - } + } else if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); +#endif /* - * If we're in an interrupt, have no user context or are running - * in an atomic region then we must not take the fault: + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -1055,19 +883,19 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in - * the kernel and should generate an OOPS. Unfortunately, in the - * case of an erroneous fault occurring in a code path which already - * holds mmap_sem we will deadlock attempting to validate the fault - * against the address space. Luckily the kernel only validly - * references user space from well defined areas of code, which are - * listed in the exceptions table. + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a - * deadlock. Attempt to lock the address space, if we cannot we then - * validate the source. 
If this is invalid we can skip the address - * space check, thus avoiding the deadlock: + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -1078,9 +906,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). */ might_sleep(); } @@ -1100,7 +927,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535, $31" pushes + * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -1119,7 +946,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; - if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -1128,21 +954,75 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. */ fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; - check_v8086_mode(regs, address, tsk); - +#ifdef CONFIG_X86_32 + /* + * Did it hit the DOS screen memory VA from vm86 mode? 
+ */ + if (v8086_mode(regs)) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } +#endif up_read(&mm->mmap_sem); } + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ + unsigned long address; + +#ifdef CONFIG_X86_32 + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +#else /* CONFIG_X86_64 */ + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +#endif +} diff --git a/trunk/arch/x86/mm/pageattr.c b/trunk/arch/x86/mm/pageattr.c index 8253bc97587e..7be47d1a97e4 100644 --- a/trunk/arch/x86/mm/pageattr.c +++ b/trunk/arch/x86/mm/pageattr.c @@ -482,13 +482,6 @@ static int split_large_page(pte_t *kpte, unsigned long address) pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* - * If we ever want to utilize the PAT bit, we need to - * update this function to make sure it's converted from - * bit 12 to bit 7 when we cross from the 2MB level to - * the 4K level: - */ - WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { diff --git a/trunk/arch/x86/vdso/vma.c b/trunk/arch/x86/vdso/vma.c index 7133cdf9098b..9c98cc6ba978 100644 --- a/trunk/arch/x86/vdso/vma.c +++ b/trunk/arch/x86/vdso/vma.c @@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; + if (end >= TASK_SIZE64) + end = TASK_SIZE64; end -= len; /* This loses some more bits than a modulo, but is cheaper */ offset = get_random_int() & (PTRS_PER_PTE - 1); diff --git a/trunk/include/linux/kprobes.h b/trunk/include/linux/kprobes.h index 2ec6cc14a114..32851eef48f0 100644 --- a/trunk/include/linux/kprobes.h +++ b/trunk/include/linux/kprobes.h @@ -182,14 +182,6 @@ struct kprobe_blackpoint { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -/* - * For #ifdef avoidance: - */ -static inline int kprobes_built_in(void) -{ - return 1; -} - #ifdef CONFIG_KRETPROBES extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -279,16 +271,8 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); -#else /* !CONFIG_KPROBES: */ +#else /* CONFIG_KPROBES */ -static inline int kprobes_built_in(void) -{ - return 0; -} -static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - return 0; -} static inline 
struct kprobe *get_kprobe(void *addr) { return NULL; @@ -345,5 +329,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } -#endif /* CONFIG_KPROBES */ -#endif /* _LINUX_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _LINUX_KPROBES_H */ diff --git a/trunk/include/linux/mmiotrace.h b/trunk/include/linux/mmiotrace.h index 3d1b7bde1283..139d7c88d9c9 100644 --- a/trunk/include/linux/mmiotrace.h +++ b/trunk/include/linux/mmiotrace.h @@ -1,5 +1,5 @@ -#ifndef _LINUX_MMIOTRACE_H -#define _LINUX_MMIOTRACE_H +#ifndef MMIOTRACE_H +#define MMIOTRACE_H #include #include @@ -13,34 +13,28 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - /* kmmio internal list: */ - struct list_head list; - /* start location of the probe point: */ - unsigned long addr; - /* length of the probe region: */ - unsigned long len; - /* Called before addr is executed: */ - kmmio_pre_handler_t pre_handler; - /* Called after addr is executed: */ - kmmio_post_handler_t post_handler; - void *private; + struct list_head list; /* kmmio internal list */ + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *private; }; -extern unsigned int kmmio_count; - -extern int register_kmmio_probe(struct kmmio_probe *p); -extern void unregister_kmmio_probe(struct kmmio_probe *p); - -#ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ static inline int is_kmmio_active(void) { + extern unsigned int kmmio_count; return kmmio_count; } +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); +#ifdef CONFIG_MMIOTRACE /* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); @@ -49,17 +43,7 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ extern int mmiotrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -#else /* !CONFIG_MMIOTRACE: */ -static inline int is_kmmio_active(void) -{ - return 0; -} - -static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr) -{ - return 0; -} - +#else static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) { @@ -79,28 +63,28 @@ static inline int mmiotrace_printk(const char *fmt, ...) 
#endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { - MMIO_READ = 0x1, /* struct mmiotrace_rw */ - MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ - MMIO_PROBE = 0x3, /* struct mmiotrace_map */ - MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { - resource_size_t phys; /* PCI address of register */ - unsigned long value; - unsigned long pc; /* optional program counter */ - int map_id; - unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ - unsigned char width; /* size of register access in bytes */ + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; struct mmiotrace_map { - resource_size_t phys; /* base address in PCI space */ - unsigned long virt; /* base virtual address */ - unsigned long len; /* mapping size */ - int map_id; - unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; /* in kernel/trace/trace_mmiotrace.c */ @@ -110,4 +94,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); extern int mmio_trace_printk(const char *fmt, va_list args); -#endif /* _LINUX_MMIOTRACE_H */ +#endif /* MMIOTRACE_H */
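
Note (editor's illustration, not part of the patch): the fault.c hunk above documents the x86 page-fault error-code bits and defines them as the PF_PROT/PF_WRITE/PF_USER/PF_RSVD/PF_INSTR macros. The following minimal standalone C sketch decodes an error code using that same bit layout; it is a reference aid only and assumes nothing beyond the bit definitions shown in the diff.

/*
 * Standalone sketch: decode an x86 page-fault error code using the
 * same bit layout as the PF_* macros defined in the fault.c hunk.
 * Builds and runs in userspace; not kernel code.
 */
#include <stdio.h>

#define PF_PROT		(1 << 0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE	(1 << 1)	/* 0: read access,   1: write access     */
#define PF_USER		(1 << 2)	/* 0: kernel mode,   1: user mode        */
#define PF_RSVD		(1 << 3)	/* 1: reserved bit detected              */
#define PF_INSTR	(1 << 4)	/* 1: fault was an instruction fetch     */

static void decode_pf_error_code(unsigned long error_code)
{
	printf("error_code %#lx: %s access, %s, %s mode%s%s\n",
	       error_code,
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT)  ? "protection fault" : "page not present",
	       (error_code & PF_USER)  ? "user" : "kernel",
	       (error_code & PF_RSVD)  ? ", reserved bit set"  : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	/* A user-mode write to an unmapped page: PF_USER | PF_WRITE (0x6). */
	decode_pf_error_code(PF_USER | PF_WRITE);

	/* A user-mode exec of a present, NX-protected page (0x15) -- the
	 * case show_fault_oops() flags as a possible exploit attempt. */
	decode_pf_error_code(PF_USER | PF_PROT | PF_INSTR);

	return 0;
}

This mirrors how do_page_fault() branches on the same bits: PF_USER selects the SIGSEGV path over no_context(), PF_RSVD routes to pgtable_bad(), and PF_INSTR short-circuits the is_prefetch() check.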