
kaiser: merged update
Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging).

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Dave Hansen authored and Greg Kroah-Hartman committed Jan 5, 2018
1 parent 13be448 commit 8f0baad
Showing 15 changed files with 549 additions and 188 deletions.
105 changes: 92 additions & 13 deletions arch/x86/entry/entry_64.S
@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64

opportunistic_sysret_failed:
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
/*
* error_entry() always returns with a kernel gsbase and
* CR3. We must also have a kernel CR3/gsbase before
* calling TRACE_IRQS_*. Just unconditionally switch to
* the kernel CR3 here.
*/
SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
SWITCH_KERNEL_CR3

/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1249,7 +1275,10 @@ ENTRY(nmi)
*/

SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/*
* percpu variables are mapped with user CR3, so no need
* to switch CR3 here.
*/
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1283,14 +1312,33 @@ ENTRY(nmi)

movq %rsp, %rdi
movq $-1, %rsi
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
call do_nmi
/*
* Unconditionally restore CR3. I know we return to
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif

/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
* work, because we don't want to enable interrupts. Do not
* switch to user CR3: we might be going back to kernel code
* that had a user CR3 set.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret

@@ -1486,23 +1534,54 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK

/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
* Use the same approach as paranoid_entry to handle SWAPGS, but
* without CR3 handling since we do that differently in NMIs. No
* need to use paranoid_exit as we should not be calling schedule
* in NMI context. Even with normal interrupts enabled. An NMI
* should not be setting NEED_RESCHED or anything that normal
* interrupts and exceptions might do.
*/
call paranoid_entry
cld
SAVE_C_REGS
SAVE_EXTRA_REGS
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1:
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif

/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
movq $-1, %rsi
call do_nmi
/*
* Unconditionally restore CR3. We might be returning to
* kernel code that needs user CR3, like just before
* a sysret.
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif

testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
SWITCH_USER_CR3_NO_STACK
/* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
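For context: the open-coded andq $(~0x1000), %rax / orq $(0x1000) pairs above work because the KAISER patches keep the kernel PGD and its shadow (user) copy in adjacent 4k pages, so bit 12 of the CR3 value selects which copy is live. A minimal standalone C sketch of that bit arithmetic, with made-up values and local constants rather than the kernel's headers:

#include <stdint.h>
#include <stdio.h>

/* Assumption mirroring CONFIG_KAISER_REAL_SWITCH above: the kernel and
 * shadow (user) PGDs are one page apart, so CR3 bit 12 picks the copy. */
#define KAISER_PGD_BIT 0x1000UL

static uint64_t kernel_cr3(uint64_t cr3) { return cr3 & ~KAISER_PGD_BIT; } /* andq $(~0x1000) */
static uint64_t user_cr3(uint64_t cr3)   { return cr3 |  KAISER_PGD_BIT; } /* orq  $(0x1000)  */

int main(void)
{
	uint64_t cr3 = 0x12345d000ULL | KAISER_PGD_BIT;	/* made-up user CR3 */

	printf("user   CR3 %#llx -> kernel CR3 %#llx\n",
	       (unsigned long long)cr3,
	       (unsigned long long)kernel_cr3(cr3));
	printf("kernel CR3 %#llx -> user   CR3 %#llx\n",
	       (unsigned long long)kernel_cr3(cr3),
	       (unsigned long long)user_cr3(kernel_cr3(cr3)));
	return 0;
}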
43 changes: 26 additions & 17 deletions arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm

@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm

#endif /* CONFIG_KAISER */

#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* Upon kernel/user mode switch, it may happen that the address
* space has to be switched before the registers have been
* stored. To change the address space, another register is
* needed. A register therefore has to be stored/restored.
*/

#endif /* CONFIG_KAISER */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/**
* shadowmem_add_mapping - map a virtual memory part to the shadow mapping
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
* @size: the size of the range
* @flags: The mapping flags of the pages
*
* the mapping is done on a global scope, so no bigger synchronization has to be done.
* the pages have to be manually unmapped again when they are not needed any longer.
* The mapping is done on a global scope, so no bigger
* synchronization has to be done. The pages have to be
* manually unmapped again when they are not needed any longer.
*/
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
* shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
* @size: the size of the range
*/
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
* shadowmem_initialize_mapping - Initalize the shadow mapping
* kaiser_initialize_mapping - Initialize the shadow mapping
*
* most parts of the shadow mapping can be mapped upon boot time.
* only the thread stacks have to be mapped on runtime.
* the mapped regions are not unmapped at all.
* Most parts of the shadow mapping can be mapped upon boot
* time. Only per-process things like the thread stacks
* or a new LDT have to be mapped at runtime. These boot-
* time mappings are permanent and never unmapped.
*/
extern void kaiser_init(void);

#endif
#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */



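The kerneldoc above describes the interface; a hedged sketch of how a caller might use kaiser_add_mapping() now that it returns int. The helper name, flags parameter, and error handling below are illustrative, not taken from the patch, and the snippet is only meant to compile against a KAISER kernel, not to run standalone:

/* Prototypes repeated from the patched asm/kaiser.h so the sketch is a
 * self-contained translation unit. */
extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
			      unsigned long flags);
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/* Hypothetical helper: expose one kernel buffer in the shadow (user)
 * page tables so entry/exit code can touch it while on the user CR3. */
static int example_expose_to_shadow(void *buf, unsigned long size,
				    unsigned long pgprot_flags)
{
	int err = kaiser_add_mapping((unsigned long)buf, size, pgprot_flags);

	if (err)
		return err;	/* propagate failure from the int-returning call */

	/* ... buffer is now visible in both copies of the page tables ... */

	kaiser_remove_mapping((unsigned long)buf, size);
	return 0;
}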
18 changes: 15 additions & 3 deletions arch/x86/include/asm/pgtable.h
@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

static inline int pgd_bad(pgd_t pgd)
{
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
pgdval_t ignore_flags = _PAGE_USER;
/*
* We set NX on KAISER pgds that map userspace memory so
* that userspace can not meaningfully use the kernel
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
if (IS_ENABLED(CONFIG_KAISER))
ignore_flags |= _PAGE_NX;

return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)
Expand Down Expand Up @@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
// clone the shadow pgd part as well
memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
/* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src),
count * sizeof(pgd_t));
#endif
}

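To see why _PAGE_NX has to join the ignore mask in pgd_bad() above, here is a small standalone C rendering of the same check. The bit values are local constants mirroring x86 bit positions, not the kernel headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_RW       (1ULL << 1)
#define _PAGE_USER     (1ULL << 2)
#define _PAGE_ACCESSED (1ULL << 5)
#define _PAGE_DIRTY    (1ULL << 6)
#define _PAGE_NX       (1ULL << 63)
#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

static bool pgd_bad(uint64_t flags, bool kaiser_enabled)
{
	uint64_t ignore = _PAGE_USER;

	if (kaiser_enabled)
		ignore |= _PAGE_NX;	/* NX is expected on entries mapping userspace */

	return (flags & ~ignore) != _KERNPG_TABLE;
}

int main(void)
{
	/* A kernel-copy entry that maps userspace: NX set by native_set_pgd(). */
	uint64_t entry = _KERNPG_TABLE | _PAGE_USER | _PAGE_NX;

	printf("without NX ignored: %s\n", pgd_bad(entry, false) ? "bad" : "ok");
	printf("with NX ignored:    %s\n", pgd_bad(entry, true)  ? "bad" : "ok");
	return 0;
}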
48 changes: 40 additions & 8 deletions arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */

/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
* This returns true for user pages that need to get copied into
* both the user and kernel copies of the page tables, and false
* for kernel pages that should only be in the kernel copy.
*/
static inline bool is_userspace_pgd(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;

return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
// We know that a pgd is page aligned.
// Therefore the lower indices have to be mapped to user space.
// These pages are mapped to the shadow mapping.
if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
pteval_t extra_kern_pgd_flags = 0;
/* Do we need to also populate the shadow pgd? */
if (is_userspace_pgd(pgdp)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
/*
* Even if the entry is *mapping* userspace, ensure
* that userspace can not use it. This way, if we
* get out to userspace running on the kernel CR3,
* userspace will crash instead of running.
*/
extra_kern_pgd_flags = _PAGE_NX;
}

pgdp->pgd = pgd.pgd & ~_PAGE_USER;
pgdp->pgd = pgd.pgd;
pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
*pgdp = pgd;
#endif
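A standalone C illustration of the pointer arithmetic above, under the layout the KAISER patches arrange elsewhere: the kernel PGD and its shadow copy come from one 2-page, 8k-aligned allocation, so PAGE_SIZE acts as a selector bit in the pointer, and the offset within the page says whether an entry indexes the user (lower) or kernel (upper) half of the address space. Names and the allocation are illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static uint64_t *get_shadow_pgd(uint64_t *pgdp)
{
	return (uint64_t *)((uintptr_t)pgdp | PAGE_SIZE);	/* second page of the pair */
}

static uint64_t *get_normal_pgd(uint64_t *pgdp)
{
	return (uint64_t *)((uintptr_t)pgdp & ~PAGE_SIZE);	/* first page of the pair */
}

static bool is_userspace_pgd(void *ptr)
{
	return ((uintptr_t)ptr % PAGE_SIZE) < (PAGE_SIZE / 2);
}

int main(void)
{
	/* Stand-in for the real 2-page PGD allocation. */
	uint64_t *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;

	printf("kernel copy: %p\n", (void *)pgd);
	printf("shadow copy: %p\n", (void *)get_shadow_pgd(pgd));
	printf("entry 0   in userspace half? %d\n", is_userspace_pgd(&pgd[0]));
	printf("entry 256 in userspace half? %d\n", is_userspace_pgd(&pgd[256]));

	free(get_normal_pgd(pgd));
	return 0;
}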
6 changes: 1 addition & 5 deletions arch/x86/include/asm/pgtable_types.h
@@ -48,7 +48,7 @@
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -123,11 +123,7 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif

#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE (_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
