KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.

More information about the patch can be found at:

        https://github.com/IAIK/KAISER

From: Richard Fellner <richard.fellner@student.tugraz.at>
From: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
Date: Thu, 4 May 2017 14:26:50 +0200
Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5

To: <linux-kernel@vger.kernel.org>
To: <kernel-hardening@lists.openwall.com>
Cc: <clementine.maurice@iaik.tugraz.at>
Cc: <moritz.lipp@iaik.tugraz.at>
Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Cc: Richard Fellner <richard.fellner@student.tugraz.at>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <kirill.shutemov@linux.intel.com>
Cc: <anders.fogh@gdata-adan.de>

After several recent works [1,2,3], KASLR on x86_64 was considered essentially
broken by many researchers. We have been working on an efficient and effective
fix and found that not mapping the kernel address space while running in user
mode solves the problem [4] (the corresponding paper [5] will be presented at
ESSoS17).

With this RFC patch, anybody can configure their kernel with CONFIG_KAISER to
enable our defense mechanism.

If there are any questions, we would love to answer them.
We also appreciate any comments!

Cheers,
Daniel (+ the KAISER team from Graz University of Technology)

[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf

[patch also based on
https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch]

Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
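
As an illustrative aside (not part of the patch): assuming the PGD is allocated as an
order-1 (8 KiB) block, as the shadow-PGD helpers in pgtable_64.h below imply, the kernel
PGD lives in the lower 4 KiB page and the shadow PGD in the upper one, so switching
address spaces reduces to toggling bit 12 (0x1000) of CR3 — exactly what the
_SWITCH_TO_KERNEL_CR3/_SWITCH_TO_USER_CR3 macros do. The CR3 value and helper names in
this user-space C sketch are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define KAISER_PGD_BIT 0x1000UL   /* bit 12 selects the shadow vs. kernel PGD page */

static uint64_t switch_to_user_cr3(uint64_t cr3)   { return cr3 |  KAISER_PGD_BIT; }
static uint64_t switch_to_kernel_cr3(uint64_t cr3) { return cr3 & ~KAISER_PGD_BIT; }

int main(void)
{
	uint64_t kernel_cr3 = 0x1d2e4000;                     /* hypothetical CR3 value */
	uint64_t user_cr3   = switch_to_user_cr3(kernel_cr3); /* -> 0x1d2e5000 (shadow PGD) */

	printf("kernel CR3 %#llx -> user CR3 %#llx -> back to %#llx\n",
	       (unsigned long long)kernel_cr3,
	       (unsigned long long)user_cr3,
	       (unsigned long long)switch_to_kernel_cr3(user_cr3));
	return 0;
}

Because the two PGDs differ only in bit 12, the switch is a simple read-modify-write of
CR3, needing nothing more than one scratch register (hence the push/pop of %rax in the
macros below).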
Richard Fellner authored and Greg Kroah-Hartman committed Jan 5, 2018
1 parent b5fd58e commit 13be448
Showing 22 changed files with 449 additions and 16 deletions.
17 changes: 17 additions & 0 deletions arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/kaiser.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64

@@ -323,10 +326,12 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64

opportunistic_sysret_failed:
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -424,6 +429,7 @@ ENTRY(ret_from_fork)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret

@@ -478,6 +484,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
SWITCH_KERNEL_CR3

/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -535,6 +542,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret

@@ -612,6 +620,7 @@ native_irq_return_ldt:

pushq %rdi /* Stash user RDI */
SWAPGS
SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -638,6 +647,7 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp

@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
SWITCH_KERNEL_CR3
xorl %ebx, %ebx
1: ret
END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
@@ -1084,6 +1096,7 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1135,6 +1148,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
SWITCH_KERNEL_CR3

/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1235,6 +1249,7 @@ ENTRY(nmi)
*/

SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1275,6 +1290,7 @@ ENTRY(nmi)
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret

@@ -1486,6 +1502,7 @@ end_repeat_nmi:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
7 changes: 6 additions & 1 deletion arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>

@@ -48,6 +49,7 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

/*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK

/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
SWITCH_USER_CR3
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS

SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)

/* Go back to user mode. */
TRACE_IRQS_ON
SWITCH_USER_CR3_NO_STACK
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
2 changes: 1 addition & 1 deletion arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)

typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);

#endif /* !ASSEMBLY_ */

113 changes: 113 additions & 0 deletions arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H

/* This file includes the definitions for the KAISER feature.
 * KAISER is a countermeasure against x86_64 side-channel attacks on kernel virtual memory.
 * It maintains a shadow PGD for every process. The shadow PGD has only a minimal set of
 * kernel mappings, but includes the whole user memory. On a switch into the kernel
 * (context switch or interrupt), the PGD is switched to the normal one; when the system
 * returns to user mode, the shadow PGD is enabled. This way the virtual memory caches
 * hold no usable kernel address information in user mode, and user code cannot attack
 * the whole kernel memory.
 *
 * The minimal kernel mapping holds the parts that must remain mapped in user mode, such
 * as the entry/exit code and the stacks.
 */
#ifdef __ASSEMBLY__
#ifdef CONFIG_KAISER

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
andq $(~0x1000), \reg
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
orq $(0x1000), \reg
movq \reg, %cr3
.endm

.macro SWITCH_KERNEL_CR3
pushq %rax
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax
.endm

.macro SWITCH_USER_CR3
pushq %rax
_SWITCH_TO_USER_CR3 %rax
popq %rax
.endm

.macro SWITCH_KERNEL_CR3_NO_STACK
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm


.macro SWITCH_USER_CR3_NO_STACK

movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
_SWITCH_TO_USER_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax

.endm

#else /* CONFIG_KAISER */

.macro SWITCH_KERNEL_CR3 reg
.endm
.macro SWITCH_USER_CR3 reg
.endm
.macro SWITCH_USER_CR3_NO_STACK
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm

#endif /* CONFIG_KAISER */
#else /* __ASSEMBLY__ */


#ifdef CONFIG_KAISER
// On a kernel/user mode switch, the address space may have to be switched
// before the registers have been saved to a usable stack. Switching the
// address space needs a scratch register, so that register is stashed in
// this per-CPU variable and restored afterwards.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

#endif /* CONFIG_KAISER */

/**
 * kaiser_add_mapping - map a virtual memory range into the shadow mapping
 * @addr: the start address of the range
 * @size: the size of the range
 * @flags: the mapping flags of the pages
 *
 * The mapping is installed globally, so no further synchronization is needed.
 * The pages have to be unmapped manually when they are no longer needed.
 */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);


/**
 * kaiser_remove_mapping - unmap a virtual memory range from the shadow mapping
 * @start: the start address of the range
 * @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 * kaiser_init - initialize the shadow mapping
 *
 * Most parts of the shadow mapping can be mapped at boot time.
 * Only the thread stacks have to be mapped at runtime.
 * The mapped regions are never unmapped.
 */
extern void kaiser_init(void);

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_KAISER_H */
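
A minimal usage sketch of the interface above (an illustration under assumptions, not code
from the patch): anything that must stay reachable from the shadow (user-mode) page tables
— entry code, per-CPU entry state such as the vector_irq array above or the TSS below — is
registered explicitly via kaiser_add_mapping(). The my_percpu_area buffer and the
my_kaiser_setup() initcall are hypothetical names; __PAGE_KERNEL is the usual kernel
page-protection constant.

#include <linux/init.h>
#include <linux/mm.h>
#include <asm/kaiser.h>
#include <asm/pgtable_types.h>

/* Hypothetical page-aligned buffer that the entry path needs to touch. */
static char my_percpu_area[PAGE_SIZE] __aligned(PAGE_SIZE);

static int __init my_kaiser_setup(void)
{
	/* Make this page visible in every shadow PGD as well, so it can be
	 * accessed before SWITCH_KERNEL_CR3 has run. */
	kaiser_add_mapping((unsigned long)my_percpu_area, PAGE_SIZE,
			   __PAGE_KERNEL);
	return 0;
}
subsys_initcall(my_kaiser_setup);

Statically allocated per-CPU data is instead marked with the DECLARE_PER_CPU_USER_MAPPED()
variants seen in this patch, so that it can be mapped into the shadow page tables at boot
via kaiser_init().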
4 changes: 4 additions & 0 deletions arch/x86/include/asm/pgtable.h
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
// clone the shadow pgd part as well
memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
21 changes: 21 additions & 0 deletions arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
#endif /* CONFIG_KAISER */

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
// The PGD page is page aligned, so an entry's offset within the page gives
// its index. Entries in the lower half of the page cover user-space
// addresses; only those are mirrored into the shadow PGD.
if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
}

pgdp->pgd = pgd.pgd & ~_PAGE_USER;
#else /* CONFIG_KAISER */
*pgdp = pgd;
#endif
}

static inline void native_pgd_clear(pgd_t *pgd)
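
To make the (offset % PAGE_SIZE) < PAGE_SIZE/2 test in native_set_pgd() concrete: the PGD
is page aligned and each entry is 8 bytes, so an entry's byte offset encodes its index,
and indices 0-255 (offsets below PAGE_SIZE/2) cover the lower, user half of the 48-bit
virtual address space — only those entries are mirrored into the shadow PGD. A small
stand-alone sketch of the same test, expressed in terms of virtual addresses (example
addresses chosen for illustration):

#include <stdint.h>
#include <stdio.h>

#define PGDIR_SHIFT   39            /* each PGD entry covers 512 GiB */
#define PTRS_PER_PGD  512

/* Same condition as ((unsigned long)pgdp % PAGE_SIZE) < PAGE_SIZE/2:
 * an entry index below 256 means the entry maps user-half addresses. */
static int mirrored_to_shadow_pgd(uint64_t vaddr)
{
	unsigned int idx = (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);

	return idx < PTRS_PER_PGD / 2;
}

int main(void)
{
	printf("user addr   0x00007f0000000000 mirrored: %d\n",
	       mirrored_to_shadow_pgd(0x00007f0000000000ULL));   /* 1 */
	printf("kernel addr 0xffff880000000000 mirrored: %d\n",
	       mirrored_to_shadow_pgd(0xffff880000000000ULL));   /* 0 */
	return 0;
}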
12 changes: 10 additions & 2 deletions arch/x86/include/asm/pgtable_types.h
@@ -45,7 +45,11 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -119,7 +123,11 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif

#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE (_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
7 changes: 6 additions & 1 deletion arch/x86/include/asm/processor.h
@@ -308,7 +308,7 @@ struct tss_struct {

} ____cacheline_aligned;

DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);

#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -335,6 +335,11 @@ union irq_stack_union {
char gs_base[40];
unsigned long stack_canary;
};

struct {
char irq_stack_pointer[64];
char unused[IRQ_STACK_SIZE - 64];
};
};

DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;