
kaiser: merged update
Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging).

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Dave Hansen authored and Greg Kroah-Hartman committed Jan 5, 2018
1 parent 13be448 commit 8f0baad
Showing 15 changed files with 549 additions and 188 deletions.
105 changes: 92 additions & 13 deletions arch/x86/entry/entry_64.S
@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64

opportunistic_sysret_failed:
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
/*
* error_entry() always returns with a kernel gsbase and
* CR3. We must also have a kernel CR3/gsbase before
* calling TRACE_IRQS_*. Just unconditionally switch to
* the kernel CR3 here.
*/
SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
SWITCH_KERNEL_CR3

/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1249,7 +1275,10 @@ ENTRY(nmi)
*/

SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/*
* percpu variables are mapped with user CR3, so no need
* to switch CR3 here.
*/
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1283,14 +1312,33 @@ ENTRY(nmi)

movq %rsp, %rdi
movq $-1, %rsi
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
call do_nmi
/*
* Unconditionally restore CR3. I know we return to
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif

/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
* work, because we don't want to enable interrupts. Do not
* switch to user CR3: we might be going back to kernel code
* that had a user CR3 set.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret

@@ -1486,23 +1534,54 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK

/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
* Use the same approach as paranoid_entry to handle SWAPGS, but
* without CR3 handling since we do that differently in NMIs. No
* need to use paranoid_exit as we should not be calling schedule
* in NMI context. Even with normal interrupts enabled. An NMI
* should not be setting NEED_RESCHED or anything that normal
* interrupts and exceptions might do.
*/
call paranoid_entry
cld
SAVE_C_REGS
SAVE_EXTRA_REGS
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1:
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif

/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
movq $-1, %rsi
call do_nmi
/*
* Unconditionally restore CR3. We might be returning to
* kernel code that needs user CR3, like just before
* a sysret.
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif

testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
SWITCH_USER_CR3_NO_STACK
/* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
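For context: the open-coded andq $(~0x1000), %rax / orq $(0x1000) pairs above work because the KAISER patches keep the kernel PGD and its shadow (user) copy in adjacent 4k pages, so bit 12 of the CR3 value selects which copy is live. A minimal standalone C sketch of that bit arithmetic, with made-up values and local constants rather than the kernel's headers:

#include <stdint.h>
#include <stdio.h>

/* Assumption mirroring CONFIG_KAISER_REAL_SWITCH above: the kernel and
 * shadow (user) PGDs are one page apart, so CR3 bit 12 picks the copy. */
#define KAISER_PGD_BIT 0x1000UL

static uint64_t kernel_cr3(uint64_t cr3) { return cr3 & ~KAISER_PGD_BIT; } /* andq $(~0x1000) */
static uint64_t user_cr3(uint64_t cr3)   { return cr3 |  KAISER_PGD_BIT; } /* orq  $(0x1000)  */

int main(void)
{
	uint64_t cr3 = 0x12345d000ULL | KAISER_PGD_BIT;	/* made-up user CR3 */

	printf("user   CR3 %#llx -> kernel CR3 %#llx\n",
	       (unsigned long long)cr3,
	       (unsigned long long)kernel_cr3(cr3));
	printf("kernel CR3 %#llx -> user   CR3 %#llx\n",
	       (unsigned long long)kernel_cr3(cr3),
	       (unsigned long long)user_cr3(kernel_cr3(cr3)));
	return 0;
}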
43 changes: 26 additions & 17 deletions arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm

@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm

#endif /* CONFIG_KAISER */

#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* Upon kernel/user mode switch, it may happen that the address
* space has to be switched before the registers have been
* stored. To change the address space, another register is
* needed. A register therefore has to be stored/restored.
*/

#endif /* CONFIG_KAISER */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/**
* shadowmem_add_mapping - map a virtual memory part to the shadow mapping
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
* @size: the size of the range
* @flags: The mapping flags of the pages
*
* the mapping is done on a global scope, so no bigger synchronization has to be done.
* the pages have to be manually unmapped again when they are not needed any longer.
* The mapping is done on a global scope, so no bigger
* synchronization has to be done. The pages have to be
* manually unmapped again when they are not needed any longer.
*/
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
* shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
* @size: the size of the range
*/
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
* shadowmem_initialize_mapping - Initalize the shadow mapping
* kaiser_initialize_mapping - Initialize the shadow mapping
*
* most parts of the shadow mapping can be mapped upon boot time.
* only the thread stacks have to be mapped on runtime.
* the mapped regions are not unmapped at all.
* Most parts of the shadow mapping can be mapped upon boot
* time. Only per-process things like the thread stacks
* or a new LDT have to be mapped at runtime. These boot-
* time mappings are permanent and never unmapped.
*/
extern void kaiser_init(void);

#endif
#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */



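The kerneldoc above describes the interface; a hedged sketch of how a caller might use kaiser_add_mapping() now that it returns int. The helper name, flags parameter, and error handling below are illustrative, not taken from the patch, and the snippet is only meant to compile against a KAISER kernel, not to run standalone:

/* Prototypes repeated from the patched asm/kaiser.h so the sketch is a
 * self-contained translation unit. */
extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
			      unsigned long flags);
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/* Hypothetical helper: expose one kernel buffer in the shadow (user)
 * page tables so entry/exit code can touch it while on the user CR3. */
static int example_expose_to_shadow(void *buf, unsigned long size,
				    unsigned long pgprot_flags)
{
	int err = kaiser_add_mapping((unsigned long)buf, size, pgprot_flags);

	if (err)
		return err;	/* propagate failure from the int-returning call */

	/* ... buffer is now visible in both copies of the page tables ... */

	kaiser_remove_mapping((unsigned long)buf, size);
	return 0;
}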
18 changes: 15 additions & 3 deletions arch/x86/include/asm/pgtable.h
@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

static inline int pgd_bad(pgd_t pgd)
{
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
pgdval_t ignore_flags = _PAGE_USER;
/*
* We set NX on KAISER pgds that map userspace memory so
* that userspace can not meaningfully use the kernel
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
if (IS_ENABLED(CONFIG_KAISER))
ignore_flags |= _PAGE_NX;

return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)
Expand Down Expand Up @@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
// clone the shadow pgd part as well
memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
/* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src),
count * sizeof(pgd_t));
#endif
}

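To see why _PAGE_NX has to join the ignore mask in pgd_bad() above, here is a small standalone C rendering of the same check. The bit values are local constants mirroring x86 bit positions, not the kernel headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_RW       (1ULL << 1)
#define _PAGE_USER     (1ULL << 2)
#define _PAGE_ACCESSED (1ULL << 5)
#define _PAGE_DIRTY    (1ULL << 6)
#define _PAGE_NX       (1ULL << 63)
#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

static bool pgd_bad(uint64_t flags, bool kaiser_enabled)
{
	uint64_t ignore = _PAGE_USER;

	if (kaiser_enabled)
		ignore |= _PAGE_NX;	/* NX is expected on entries mapping userspace */

	return (flags & ~ignore) != _KERNPG_TABLE;
}

int main(void)
{
	/* A kernel-copy entry that maps userspace: NX set by native_set_pgd(). */
	uint64_t entry = _KERNPG_TABLE | _PAGE_USER | _PAGE_NX;

	printf("without NX ignored: %s\n", pgd_bad(entry, false) ? "bad" : "ok");
	printf("with NX ignored:    %s\n", pgd_bad(entry, true)  ? "bad" : "ok");
	return 0;
}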
48 changes: 40 additions & 8 deletions arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */

/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
* This returns true for user pages that need to get copied into
* both the user and kernel copies of the page tables, and false
* for kernel pages that should only be in the kernel copy.
*/
static inline bool is_userspace_pgd(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;

return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
// We know that a pgd is page aligned.
// Therefore the lower indices have to be mapped to user space.
// These pages are mapped to the shadow mapping.
if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
pteval_t extra_kern_pgd_flags = 0;
/* Do we need to also populate the shadow pgd? */
if (is_userspace_pgd(pgdp)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
/*
* Even if the entry is *mapping* userspace, ensure
* that userspace can not use it. This way, if we
* get out to userspace running on the kernel CR3,
* userspace will crash instead of running.
*/
extra_kern_pgd_flags = _PAGE_NX;
}

pgdp->pgd = pgd.pgd & ~_PAGE_USER;
pgdp->pgd = pgd.pgd;
pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
*pgdp = pgd;
#endif
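A standalone C illustration of the pointer arithmetic above, under the layout the KAISER patches arrange elsewhere: the kernel PGD and its shadow copy come from one 2-page, 8k-aligned allocation, so PAGE_SIZE acts as a selector bit in the pointer, and the offset within the page says whether an entry indexes the user (lower) or kernel (upper) half of the address space. Names and the allocation are illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static uint64_t *get_shadow_pgd(uint64_t *pgdp)
{
	return (uint64_t *)((uintptr_t)pgdp | PAGE_SIZE);	/* second page of the pair */
}

static uint64_t *get_normal_pgd(uint64_t *pgdp)
{
	return (uint64_t *)((uintptr_t)pgdp & ~PAGE_SIZE);	/* first page of the pair */
}

static bool is_userspace_pgd(void *ptr)
{
	return ((uintptr_t)ptr % PAGE_SIZE) < (PAGE_SIZE / 2);
}

int main(void)
{
	/* Stand-in for the real 2-page PGD allocation. */
	uint64_t *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;

	printf("kernel copy: %p\n", (void *)pgd);
	printf("shadow copy: %p\n", (void *)get_shadow_pgd(pgd));
	printf("entry 0   in userspace half? %d\n", is_userspace_pgd(&pgd[0]));
	printf("entry 256 in userspace half? %d\n", is_userspace_pgd(&pgd[256]));

	free(get_normal_pgd(pgd));
	return 0;
}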
6 changes: 1 addition & 5 deletions arch/x86/include/asm/pgtable_types.h
@@ -48,7 +48,7 @@
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -123,11 +123,7 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif

#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE (_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
