x86/stackframe/32: Provide consistent pt_regs
Currently pt_regs on x86_32 has an oddity in that kernel regs
(!user_mode(regs)) are short two entries (esp/ss). This means that any
code trying to use them (typically: regs->sp) needs to jump through
some unfortunate hoops.

Change the entry code to fix this up and create a full pt_regs frame.

This then simplifies various trampolines in ftrace and kprobes, the
stack unwinder, ptrace, kdump and kgdb.

Much thanks to Josh for help with the cleanups!

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Peter Zijlstra authored and Ingo Molnar committed Jun 25, 2019
1 parent ea1ed38 commit 3c88c69
Showing 15 changed files with 177 additions and 190 deletions.
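The "hoops" the changelog mentions: with kernel-mode regs two entries short, anything wanting the stack pointer had to derive it rather than read regs->sp. A simplified C sketch of the pre-commit behaviour, modelled on the x86_32 kernel_stack_pointer() helper this commit retires (name shortened here; the real helper also coped with frames copied to the entry stack):

/* sketch of the old special case, not the verbatim helper */
static unsigned long kernel_sp_sketch(struct pt_regs *regs)
{
	if (user_mode(regs))
		return regs->sp;		/* CPU pushed sp/ss */
	/* kernel frame ended early; the interrupted stack starts here */
	return (unsigned long)&regs->sp;
}

With regs->sp always populated after this change, the helper and all its call-site special cases (see the ptrace.h, kgdb, kexec and crash hunks below) simply vanish.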
105 changes: 95 additions & 10 deletions arch/x86/entry/entry_32.S
@@ -202,9 +202,102 @@
.Lend_\@:
.endm

#define CS_FROM_ENTRY_STACK (1 << 31)
#define CS_FROM_USER_CR3 (1 << 30)
#define CS_FROM_KERNEL (1 << 29)

.macro FIXUP_FRAME
/*
* The high bits of the CS dword (__csh) are used for CS_FROM_*.
* Clear them in case hardware didn't do this for us.
*/
andl $0x0000ffff, 3*4(%esp)

#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, 4*4(%esp)
jnz .Lfrom_usermode_no_fixup_\@
#endif
testl $SEGMENT_RPL_MASK, 3*4(%esp)
jnz .Lfrom_usermode_no_fixup_\@

orl $CS_FROM_KERNEL, 3*4(%esp)

/*
* When we're here from kernel mode, the (exception) stack looks like:
*
* 5*4(%esp) - <previous context>
* 4*4(%esp) - flags
* 3*4(%esp) - cs
* 2*4(%esp) - ip
* 1*4(%esp) - orig_eax
* 0*4(%esp) - gs / function
*
* Let's build a 5-entry IRET frame after that, such that struct pt_regs
* is complete and in particular regs->sp is correct. This gives us
* the original 5 entries as a gap:
*
* 12*4(%esp) - <previous context>
* 11*4(%esp) - gap / flags
* 10*4(%esp) - gap / cs
* 9*4(%esp) - gap / ip
* 8*4(%esp) - gap / orig_eax
* 7*4(%esp) - gap / gs / function
* 6*4(%esp) - ss
* 5*4(%esp) - sp
* 4*4(%esp) - flags
* 3*4(%esp) - cs
* 2*4(%esp) - ip
* 1*4(%esp) - orig_eax
* 0*4(%esp) - gs / function
*/

pushl %ss # ss
pushl %esp # sp (points at ss)
addl $6*4, (%esp) # point sp back at the previous context
pushl 6*4(%esp) # flags
pushl 6*4(%esp) # cs
pushl 6*4(%esp) # ip
pushl 6*4(%esp) # orig_eax
pushl 6*4(%esp) # gs / function
.Lfrom_usermode_no_fixup_\@:
.endm
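
For reference, the frame FIXUP_FRAME completes is the in-kernel x86_32 struct pt_regs (arch/x86/include/asm/ptrace.h, condensed here); the __csh padding next to cs is where the CS_FROM_* bits above are stashed:

struct pt_regs {
	unsigned long	bx, cx, dx, si, di, bp, ax;
	unsigned short	ds, __dsh;
	unsigned short	es, __esh;
	unsigned short	fs, __fsh;
	unsigned short	gs, __gsh;
	unsigned long	orig_ax;
	unsigned long	ip;
	unsigned short	cs, __csh;	/* __csh carries CS_FROM_* */
	unsigned long	flags;
	unsigned long	sp;		/* valid for kernel frames from now on */
	unsigned short	ss, __ssh;
};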

.macro IRET_FRAME
testl $CS_FROM_KERNEL, 1*4(%esp)
jz .Lfinished_frame_\@

/*
* Reconstruct the 3-entry IRET frame right after the (modified)
* regs->sp without lowering %esp in between, such that an NMI in the
* middle doesn't scribble our stack.
*/
pushl %eax
pushl %ecx
movl 5*4(%esp), %eax # (modified) regs->sp

movl 4*4(%esp), %ecx # flags
movl %ecx, -4(%eax)

movl 3*4(%esp), %ecx # cs
andl $0x0000ffff, %ecx
movl %ecx, -8(%eax)

movl 2*4(%esp), %ecx # ip
movl %ecx, -12(%eax)

movl 1*4(%esp), %ecx # eax
movl %ecx, -16(%eax)

popl %ecx
lea -16(%eax), %esp
popl %eax
.Lfinished_frame_\@:
.endm
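
In C terms (an illustrative sketch only, not kernel code), IRET_FRAME amounts to the following for CS_FROM_KERNEL frames; the ftrace_regs_caller exit path further down plays the same write-below-sp trick with ip and eax:

struct iret_words { unsigned long ip, cs, flags; };

static void rebuild_iret_frame(struct pt_regs *regs)
{
	/* first write below the (possibly modified) regs->sp ... */
	struct iret_words *f = (void *)(regs->sp - sizeof(*f));

	f->flags = regs->flags;
	f->cs    = regs->cs & 0xffff;	/* drop the CS_FROM_* bits */
	f->ip    = regs->ip;
	/*
	 * ... and only then switch %esp over, in a single lea, so an
	 * NMI hitting in between never sees a half-built frame.
	 */
}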

.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
cld
PUSH_GS
FIXUP_FRAME
pushl %fs
pushl %es
pushl %ds
@@ -358,9 +451,6 @@
* switch to it before we do any copying.
*/

#define CS_FROM_ENTRY_STACK (1 << 31)
#define CS_FROM_USER_CR3 (1 << 30)

.macro SWITCH_TO_KERNEL_STACK

ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -374,13 +464,6 @@
* that register for the time this macro runs
*/

/*
* The high bits of the CS dword (__csh) are used for
* CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
* hardware didn't do this for us.
*/
andl $(0x0000ffff), PT_CS(%esp)

/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -990,6 +1073,7 @@ restore_all:
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
.Lirq_return:
IRET_FRAME
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
* when returning from IPI handler and when returning from
@@ -1340,6 +1424,7 @@ END(page_fault)

common_exception:
/* the function address is in %gs's slot on the stack */
FIXUP_FRAME
pushl %fs
pushl %es
pushl %ds
17 changes: 0 additions & 17 deletions arch/x86/include/asm/kexec.h
@@ -70,22 +70,6 @@ struct kimage;
#define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */

/*
* CPU does not save ss and sp on stack if execution is already
* running in kernel mode at the time of NMI occurrence. This code
* fixes it.
*/
static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
struct pt_regs *oldregs)
{
#ifdef CONFIG_X86_32
newregs->sp = (unsigned long)&(oldregs->sp);
asm volatile("xorl %%eax, %%eax\n\t"
"movw %%ss, %%ax\n\t"
:"=a"(newregs->ss));
#endif
}

/*
* This function is responsible for capturing register states if coming
* via panic otherwise just fix up the ss and sp if coming via kernel
@@ -96,7 +80,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
{
if (oldregs) {
memcpy(newregs, oldregs, sizeof(*newregs));
crash_fixup_ss_esp(newregs, oldregs);
} else {
#ifdef CONFIG_X86_32
asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
17 changes: 2 additions & 15 deletions arch/x86/include/asm/ptrace.h
@@ -166,14 +166,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif

#ifdef CONFIG_X86_32
extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
#else
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
#endif

#define GET_IP(regs) ((regs)->ip)
#define GET_FP(regs) ((regs)->bp)
@@ -201,14 +197,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
if (unlikely(offset > MAX_REG_OFFSET))
return 0;
#ifdef CONFIG_X86_32
/*
* Traps from the kernel do not save sp and ss.
* Use the helper function to retrieve sp.
*/
if (offset == offsetof(struct pt_regs, sp) &&
regs->cs == __KERNEL_CS)
return kernel_stack_pointer(regs);

/* The selector fields are 16-bit. */
if (offset == offsetof(struct pt_regs, cs) ||
offset == offsetof(struct pt_regs, ss) ||
@@ -234,8 +222,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
static inline int regs_within_kernel_stack(struct pt_regs *regs,
unsigned long addr)
{
return ((addr & ~(THREAD_SIZE - 1)) ==
(kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1)));
}

/**
@@ -249,7 +236,7 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs,
*/
static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
{
unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
unsigned long *addr = (unsigned long *)regs->sp;

addr += n;
if (regs_within_kernel_stack(regs, (unsigned long)addr))
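With regs->sp always populated, the ptrace.h helpers above become uniform. A quick, hypothetical usage sketch (the function name is ours) that now behaves the same regardless of which mode the regs came from:

/* hypothetical example: peek at the n-th word on the stack at trap time */
static unsigned long nth_stack_word(struct pt_regs *regs, unsigned int n)
{
	unsigned long *addr = regs_get_kernel_stack_nth_addr(regs, n);

	return addr ? *addr : 0;	/* NULL if outside the kernel stack */
}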
2 changes: 1 addition & 1 deletion arch/x86/include/asm/stacktrace.h
@@ -78,7 +78,7 @@ static inline unsigned long *
get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
{
if (regs)
return (unsigned long *)kernel_stack_pointer(regs);
return (unsigned long *)regs->sp;

if (task == current)
return __builtin_frame_address(0);
8 changes: 0 additions & 8 deletions arch/x86/kernel/crash.c
@@ -73,14 +73,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)

static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;

if (!user_mode(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
#endif
crash_save_cpu(regs, cpu);

/*
75 changes: 42 additions & 33 deletions arch/x86/kernel/ftrace_32.S
@@ -10,6 +10,7 @@
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>

# define function_hook __fentry__
EXPORT_SYMBOL(__fentry__)
@@ -90,26 +91,38 @@ END(ftrace_caller)

ENTRY(ftrace_regs_caller)
/*
* i386 does not save SS and ESP when coming from kernel.
* Instead, to get sp, &regs->sp is used (see ptrace.h).
* Unfortunately, that means eflags must be at the same location
* as the current return ip is. We move the return ip into the
* regs->ip location, and move flags into the return ip location.
* We're here from an mcount/fentry CALL, and the stack frame looks like:
*
* <previous context>
* RET-IP
*
* The purpose of this function is to call out in an emulated INT3
* environment with a stack frame like:
*
* <previous context>
* gap / RET-IP
* gap
* gap
* gap
* pt_regs
*
* We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds
*/
pushl $__KERNEL_CS
pushl 4(%esp) /* Save the return ip */
pushl $0 /* Load 0 into orig_ax */
subl $3*4, %esp # RET-IP + 3 gaps
pushl %ss # ss
pushl %esp # points at ss
addl $5*4, (%esp) # make it point at <previous context>
pushfl # flags
pushl $__KERNEL_CS # cs
pushl 7*4(%esp) # ip <- RET-IP
pushl $0 # orig_eax

pushl %gs
pushl %fs
pushl %es
pushl %ds
pushl %eax

/* Get flags and place them into the return ip slot */
pushf
popl %eax
movl %eax, 8*4(%esp)

pushl %eax
pushl %ebp
pushl %edi
pushl %esi
@@ -119,39 +132,35 @@ ENTRY(ftrace_regs_caller)

ENCODE_FRAME_POINTER

movl 12*4(%esp), %eax /* Load ip (1st parameter) */
subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */
movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
pushl %esp /* Save pt_regs as 4th parameter */
movl PT_EIP(%esp), %eax # 1st argument: IP
subl $MCOUNT_INSN_SIZE, %eax
movl 21*4(%esp), %edx # 2nd argument: parent ip
movl function_trace_op, %ecx # 3rd argument: ftrace_pos
pushl %esp # 4th argument: pt_regs

GLOBAL(ftrace_regs_call)
call ftrace_stub

addl $4, %esp /* Skip pt_regs */
addl $4, %esp # skip 4th argument

/* restore flags */
push 14*4(%esp)
popf
/* place IP below the new SP */
movl PT_OLDESP(%esp), %eax
movl PT_EIP(%esp), %ecx
movl %ecx, -4(%eax)

/* Move return ip back to its original location */
movl 12*4(%esp), %eax
movl %eax, 14*4(%esp)
/* place EAX below that */
movl PT_EAX(%esp), %ecx
movl %ecx, -8(%eax)

popl %ebx
popl %ecx
popl %edx
popl %esi
popl %edi
popl %ebp
popl %eax
popl %ds
popl %es
popl %fs
popl %gs

/* use lea to not affect flags */
lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
lea -8(%eax), %esp
popl %eax

jmp .Lftrace_ret

8 changes: 0 additions & 8 deletions arch/x86/kernel/kgdb.c
@@ -118,14 +118,6 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)

#ifdef CONFIG_X86_32
switch (regno) {
case GDB_SS:
if (!user_mode(regs))
*(unsigned long *)mem = __KERNEL_DS;
break;
case GDB_SP:
if (!user_mode(regs))
*(unsigned long *)mem = kernel_stack_pointer(regs);
break;
case GDB_GS:
case GDB_FS:
*(unsigned long *)mem = 0xFFFF;
4 changes: 2 additions & 2 deletions arch/x86/kernel/kprobes/common.h
@@ -72,8 +72,8 @@
" popl %edi\n" \
" popl %ebp\n" \
" popl %eax\n" \
/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
" addl $24, %esp\n"
/* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\
" addl $7*4, %esp\n"
#endif

/* Check whether the instruction can be boosted */