From a5d90c923bcfb9632d998ed06e9569216ad695f3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 4 Mar 2014 17:02:17 +0100 Subject: [PATCH 1/4] x86/efi: Quirk out SGI UV Alex reported hitting the following BUG after the EFI 1:1 virtual mapping work was merged, kernel BUG at arch/x86/mm/init_64.c:351! invalid opcode: 0000 [#1] SMP Call Trace: [] init_extra_mapping_uc+0x13/0x15 [] uv_system_init+0x22b/0x124b [] ? clockevents_register_device+0x138/0x13d [] ? setup_APIC_timer+0xc5/0xc7 [] ? clockevent_delta2ns+0xb/0xd [] ? setup_boot_APIC_clock+0x4a8/0x4b7 [] ? printk+0x72/0x74 [] native_smp_prepare_cpus+0x389/0x3d6 [] kernel_init_freeable+0xb7/0x1fb [] ? rest_init+0x74/0x74 [] kernel_init+0x9/0xff [] ret_from_fork+0x7c/0xb0 [] ? rest_init+0x74/0x74 Getting this thing to work with the new mapping scheme would need more work, so automatically switch to the old memmap layout for SGI UV. Acked-by: Russ Anderson Cc: Alex Thorlton Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/kernel/setup.c | 10 ++-------- arch/x86/platform/efi/efi.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3d6b9f81cc68..acd86c850414 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -134,6 +134,7 @@ extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); +extern void __init efi_apply_memmap_quirks(void); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 06853e670354..ce72964b2f46 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1239,14 +1239,8 @@ void __init setup_arch(char **cmdline_p) register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI - /* Once setup is done above, unmap the EFI memory map on - * mismatched firmware/kernel archtectures since there is no - * support for runtime services. - */ - if (efi_enabled(EFI_BOOT) && !efi_is_native()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); - } + if (efi_enabled(EFI_BOOT)) + efi_apply_memmap_quirks(); #endif } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1a201ac7cef8..b97acecf3fd9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -52,6 +52,7 @@ #include #include #include +#include #define EFI_DEBUG @@ -1210,3 +1211,22 @@ static int __init parse_efi_cmdline(char *str) return 0; } early_param("efi", parse_efi_cmdline); + +void __init efi_apply_memmap_quirks(void) +{ + /* + * Once setup is done earlier, unmap the EFI memory map on mismatched + * firmware/kernel architectures since there is no support for runtime + * services. + */ + if (!efi_is_native()) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + } + + /* + * UV doesn't support the new EFI pagetable mapping yet. + */ + if (is_uv_system()) + set_bit(EFI_OLD_MEMMAP, &x86_efi_facility); +} From 0ac09f9f8cd1fb028a48330edba6023d347d3cea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 28 Feb 2014 17:05:26 +0100 Subject: [PATCH 2/4] x86, trace: Fix CR2 corruption when tracing page faults The trace_do_page_fault function trigger tracepoint and then handles the actual page fault. This could lead to error if the tracepoint caused page fault. The original cr2 value gets lost and the original page fault handler kills current process with SIGSEGV. This happens if you record page faults with callchain data, the user part of it will cause tracepoint handler to page fault: # perf record -g -e exceptions:page_fault_user ls Fixing this by saving the original cr2 value and using it after tracepoint handler is done. v2: Moving the cr2 read before exception_enter, because it could trigger tracepoint as well. Reported-by: Arnaldo Carvalho de Melo Reported-by: Vince Weaver Tested-by: Vince Weaver Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Seiji Aguchi Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1402211701380.6395@vincent-weaver-1.um.maine.edu Link: http://lkml.kernel.org/r/20140228160526.GD1133@krava.brq.redhat.com --- arch/x86/mm/fault.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6dea040cc3a1..e7fa28bf3262 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1022,11 +1022,11 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * routines. */ static void __kprobes -__do_page_fault(struct pt_regs *regs, unsigned long error_code) +__do_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; - unsigned long address; struct mm_struct *mm; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -1034,9 +1034,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; mm = tsk->mm; - /* Get the faulting address: */ - address = read_cr2(); - /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. @@ -1252,9 +1249,11 @@ dotraplinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { enum ctx_state prev_state; + /* Get the faulting address: */ + unsigned long address = read_cr2(); prev_state = exception_enter(); - __do_page_fault(regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } @@ -1271,9 +1270,16 @@ dotraplinkage void __kprobes trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) { enum ctx_state prev_state; + /* + * The exception_enter and tracepoint processing could + * trigger another page faults (user space callchain + * reading) and destroy the original cr2 value, so read + * the faulting address now. + */ + unsigned long address = read_cr2(); prev_state = exception_enter(); trace_page_fault_entries(regs, error_code); - __do_page_fault(regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } From d4078e232267ff53f3b030b9698a3c001db4dbec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Mar 2014 14:07:49 +0100 Subject: [PATCH 3/4] x86, trace: Further robustify CR2 handling vs tracing Building on commit 0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults") this patch addresses another few issues: - Now that read_cr2() is lifted into trace_do_page_fault(), we should pass the address to trace_page_fault_entries() to avoid it re-reading a potentially changed cr2. - Put both trace_do_page_fault() and trace_page_fault_entries() under CONFIG_TRACING. - Mark both fault entry functions {,trace_}do_page_fault() as notrace to avoid getting __mcount or other function entry trace callbacks before we've observed CR2. - Mark __do_page_fault() as noinline to guarantee the function tracer does get to see the fault. Cc: Cc: Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140306145300.GO9987@twins.programming.kicks-ass.net Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e7fa28bf3262..a10c8c792161 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1020,8 +1020,12 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. + * + * This function must have noinline because both callers + * {,trace_}do_page_fault() have notrace on. Having this an actual function + * guarantees there's a function trace entry. */ -static void __kprobes +static void __kprobes noinline __do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { @@ -1245,31 +1249,38 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, up_read(&mm->mmap_sem); } -dotraplinkage void __kprobes +dotraplinkage void __kprobes notrace do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; - /* Get the faulting address: */ - unsigned long address = read_cr2(); + + /* + * We must have this function tagged with __kprobes, notrace and call + * read_cr2() before calling anything else. To avoid calling any kind + * of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contain all sorts of tracepoints. + */ prev_state = exception_enter(); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -static void trace_page_fault_entries(struct pt_regs *regs, +#ifdef CONFIG_TRACING +static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) { if (user_mode(regs)) - trace_page_fault_user(read_cr2(), regs, error_code); + trace_page_fault_user(address, regs, error_code); else - trace_page_fault_kernel(read_cr2(), regs, error_code); + trace_page_fault_kernel(address, regs, error_code); } -dotraplinkage void __kprobes +dotraplinkage void __kprobes notrace trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) { - enum ctx_state prev_state; /* * The exception_enter and tracepoint processing could * trigger another page faults (user space callchain @@ -1277,9 +1288,11 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) * the faulting address now. */ unsigned long address = read_cr2(); + enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(regs, error_code); + trace_page_fault_entries(address, regs, error_code); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } +#endif /* CONFIG_TRACING */ From 5fa10196bdb5f190f595ebd048490ee52dddea0f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 7 Mar 2014 15:05:20 -0800 Subject: [PATCH 4/4] x86: Ignore NMIs that come in during early boot Don Zickus reports: A customer generated an external NMI using their iLO to test kdump worked. Unfortunately, the machine hung. Disabling the nmi_watchdog made things work. I speculated the external NMI fired, caused the machine to panic (as expected) and the perf NMI from the watchdog came in and was latched. My guess was this somehow caused the hang. ---- It appears that the latched NMI stays latched until the early page table generation on 64 bits, which causes exceptions to happen which end in IRET, which re-enable NMI. Therefore, ignore NMIs that come in during early execution, until we have proper exception handling. Reported-and-tested-by: Don Zickus Link: http://lkml.kernel.org/r/1394221143-29713-1-git-send-email-dzickus@redhat.com Signed-off-by: H. Peter Anvin Cc: # v3.5+, older with some backport effort --- arch/x86/kernel/head_32.S | 7 ++++++- arch/x86/kernel/head_64.S | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 81ba27679f18..d2a21590794a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -544,6 +544,10 @@ ENDPROC(early_idt_handlers) /* This is global to keep gas from relaxing the jumps */ ENTRY(early_idt_handler) cld + + cmpl $X86_TRAP_NMI,(%esp) + je is_nmi # Ignore NMI + cmpl $2,%ss:early_recursion_flag je hlt_loop incl %ss:early_recursion_flag @@ -594,8 +598,9 @@ ex_entry: pop %edx pop %ecx pop %eax - addl $8,%esp /* drop vector number and error code */ decl %ss:early_recursion_flag +is_nmi: + addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e1aabdb314c8..33f36c78594e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -343,6 +343,9 @@ early_idt_handlers: ENTRY(early_idt_handler) cld + cmpl $X86_TRAP_NMI,(%rsp) + je is_nmi # Ignore NMI + cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) @@ -405,8 +408,9 @@ ENTRY(early_idt_handler) popq %rdx popq %rcx popq %rax - addq $16,%rsp # drop vector number and error code decl early_recursion_flag(%rip) +is_nmi: + addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler)