From 29dedee0e693aa113164c820395ce51446a71ace Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 May 2014 16:38:18 +0200 Subject: [PATCH 01/10] uprobes: Add mem_cgroup_charge_anon() into uprobe_write_opcode() Hugh says: The one I noticed was that it forgets all about memcg (because it was copied from KSM, and there the replacement page has already been charged to a memcg). See how mm/memory.c do_anonymous_page() does a mem_cgroup_charge_anon(). Hopefully not a big problem, uprobes is a system-wide thing and only root can insert the probes. But I agree, should be fixed anyway. Add mem_cgroup_{un,}charge_anon() into uprobe_write_opcode(). To simplify the error handling (and avoid the new "uncharge" label) the patch also moves anon_vma_prepare() up before we alloc/charge the new page. While at it fix the comment about ->mmap_sem, it is held for write. Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7716c40f2c50..a13251e8bfa4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -279,18 +279,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. - */ - -/* + * * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -310,21 +305,25 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, if (ret <= 0) goto put_old; + ret = anon_vma_prepare(vma); + if (ret) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; - __SetPageUptodate(new_page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + goto put_new; + __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; - ret = __replace_page(vma, vaddr, old_page, new_page); + if (ret) + mem_cgroup_uncharge_page(new_page); put_new: page_cache_release(new_page); From 50204c6f6dd01b5bce1b53e0b003d01849455512 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 1 May 2014 16:52:46 +0200 Subject: [PATCH 02/10] uprobes/x86: Simplify rip-relative handling It is possible to replace rip-relative addressing mode with addressing mode of the same length: (reg+disp32). This eliminates the need to fix up immediate and correct for changing instruction length. And we can kill arch_uprobe->def.riprel_target. Signed-off-by: Denys Vlasenko Reviewed-by: Jim Keniston Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/uprobes.h | 3 -- arch/x86/kernel/uprobes.c | 71 ++++++++++++++-------------------- 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index a040d493a4f9..7be3c079e389 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -50,9 +50,6 @@ struct arch_uprobe { u8 opc1; } branch; struct { -#ifdef CONFIG_X86_64 - long riprel_target; -#endif u8 fixups; u8 ilen; } def; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ebadb252093..31dcb4d5ea46 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -251,9 +251,9 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set - * def->fixups and def->riprel_target accordingly. (The contents of the - * scratch register will be saved before we single-step the modified - * instruction, and restored afterward). + * def->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -264,9 +264,12 @@ static inline bool is_64bit_mm(struct mm_struct *mm) * * Some useful facts about rip-relative instructions: * - * - There's always a modrm byte. + * - There's always a modrm byte with bit layout "00 reg 101". * - There's never a SIB byte. * - The displacement is always 4 bytes. + * - REX.B=1 bit in REX prefix, which normally extends r/m field, + * has no effect on rip-relative mode. It doesn't make modrm byte + * with r/m=101 refer to register 1101 = R13. */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { @@ -293,9 +296,8 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) */ cursor = auprobe->insn + insn_offset_modrm(insn); /* - * Convert from rip-relative addressing to indirect addressing - * via a scratch register. Change the r/m field from 0x5 (%rip) - * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + * Convert from rip-relative addressing + * to register-relative addressing via a scratch register. */ reg = MODRM_REG(insn); if (reg == 0) { @@ -307,22 +309,21 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) * #1) for the scratch register. */ auprobe->def.fixups |= UPROBE_FIX_RIP_CX; - /* Change modrm from 00 000 101 to 00 000 001. */ - *cursor = 0x1; + /* + * Change modrm from "00 000 101" to "10 000 001". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 81 disp32 mov %eax,disp32(%rcx) + */ + *cursor = 0x81; } else { /* Use %rax (register #0) for the scratch register. */ auprobe->def.fixups |= UPROBE_FIX_RIP_AX; - /* Change modrm from 00 xxx 101 to 00 xxx 000 */ - *cursor = (reg << 3); - } - - /* Target address = address of next instruction + (signed) offset */ - auprobe->def.riprel_target = (long)insn->length + insn->displacement.value; - - /* Displacement field is gone; slide immediate field (if any) over. */ - if (insn->immediate.nbytes) { - cursor++; - memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); + /* + * Change modrm from "00 reg 101" to "10 reg 000". Example: + * 89 1d disp32 mov %edx,disp32(%rip) becomes + * 89 98 disp32 mov %edx,disp32(%rax) + */ + *cursor = (reg << 3) | 0x80; } } @@ -343,26 +344,17 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long *sr = scratch_reg(auprobe, regs); utask->autask.saved_scratch_register = *sr; - *sr = utask->vaddr + auprobe->def.riprel_target; + *sr = utask->vaddr + auprobe->def.ilen; } } -static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); *sr = utask->autask.saved_scratch_register; - /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Caller may need to apply other fixups to handle stuff - * like "jmpq *...(%rip)" and "callq *...(%rip)". - */ - if (correction) - *correction += 4; } } #else /* 32-bit: */ @@ -379,8 +371,7 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } -static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, - long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } #endif /* CONFIG_X86_64 */ @@ -417,10 +408,10 @@ static int push_ret_address(struct pt_regs *regs, unsigned long ip) static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - long correction = (long)(utask->vaddr - utask->xol_vaddr); - riprel_post_xol(auprobe, regs, &correction); + riprel_post_xol(auprobe, regs); if (auprobe->def.fixups & UPROBE_FIX_IP) { + long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->def.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(); @@ -436,7 +427,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - riprel_post_xol(auprobe, regs, NULL); + riprel_post_xol(auprobe, regs); } static struct uprobe_xol_ops default_xol_ops = { @@ -732,11 +723,9 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * * If the original instruction was a rip-relative instruction such as * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,(%rax)". - * We need to restore the contents of the scratch register and adjust - * the ip, keeping in mind that the instruction we executed is 4 bytes - * shorter than the original instruction (since we squeezed out the offset - * field). (FIX_RIP_AX or FIX_RIP_CX) + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rax)". + * We need to restore the contents of the scratch register + * (FIX_RIP_AX or FIX_RIP_CX). */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { From 1ea30fb64598bd3a6ba43d874bb53c55878eaef5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 May 2014 17:04:00 +0200 Subject: [PATCH 03/10] uprobes/x86: Fix scratch register selection for rip-relative fixups Before this patch, instructions such as div, mul, shifts with count in CL, cmpxchg are mishandled. This patch adds vex prefix handling. In particular, it avoids colliding with register operand encoded in vex.vvvv field. Since we need to avoid two possible register operands, the selection of scratch register needs to be from at least three registers. After looking through a lot of CPU docs, it looks like the safest choice is SI,DI,BX. Selecting BX needs care to not collide with implicit use of BX by cmpxchg8b. Test-case: #include static const char *const pass[] = { "FAIL", "pass" }; long two = 2; void test1(void) { long ax = 0, dx = 0; asm volatile("\n" " xor %%edx,%%edx\n" " lea 2(%%edx),%%eax\n" // We divide 2 by 2. Result (in eax) should be 1: " probe1: .globl probe1\n" " divl two(%%rip)\n" // If we have a bug (eax mangled on entry) the result will be 2, // because eax gets restored by probe machinery. : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[ax == 1] ); } long val2 = 0; void test2(void) { long old_val = val2; long ax = 0, dx = 0; asm volatile("\n" " mov val2,%%eax\n" // eax := val2 " lea 1(%%eax),%%edx\n" // edx := eax+1 // eax is equal to val2. cmpxchg should store edx to val2: " probe2: .globl probe2\n" " cmpxchg %%edx,val2(%%rip)\n" // If we have a bug (eax mangled on entry), val2 will stay unchanged : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[val2 == old_val + 1] ); } long val3[2] = {0,0}; void test3(void) { long old_val = val3[0]; long ax = 0, dx = 0; asm volatile("\n" " mov val3,%%eax\n" // edx:eax := val3 " mov val3+4,%%edx\n" " mov %%eax,%%ebx\n" // ecx:ebx := edx:eax + 1 " mov %%edx,%%ecx\n" " add $1,%%ebx\n" " adc $0,%%ecx\n" // edx:eax is equal to val3. cmpxchg8b should store ecx:ebx to val3: " probe3: .globl probe3\n" " cmpxchg8b val3(%%rip)\n" // If we have a bug (edx:eax mangled on entry), val3 will stay unchanged. // If ecx:edx in mangled, val3 will get wrong value. : "=a" (ax), "=d" (dx) /*out*/ : "0" (ax), "1" (dx) /*in*/ : "cx", "bx", "memory" /*clobber*/ ); dprintf(2, "%s: %s\n", __func__, pass[val3[0] == old_val + 1 && val3[1] == 0] ); } int main(int argc, char **argv) { test1(); test2(); test3(); return 0; } Before this change all tests fail if probe{1,2,3} are probed. Signed-off-by: Denys Vlasenko Reviewed-by: Jim Keniston Signed-off-by: Oleg Nesterov --- arch/x86/kernel/uprobes.c | 176 +++++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 31dcb4d5ea46..159ca520ef5b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -41,8 +41,11 @@ /* Instruction will modify TF, don't change it */ #define UPROBE_FIX_SETF 0x04 -#define UPROBE_FIX_RIP_AX 0x08 -#define UPROBE_FIX_RIP_CX 0x10 +#define UPROBE_FIX_RIP_SI 0x08 +#define UPROBE_FIX_RIP_DI 0x10 +#define UPROBE_FIX_RIP_BX 0x20 +#define UPROBE_FIX_RIP_MASK \ + (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX @@ -275,20 +278,109 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; + u8 reg2; if (!insn_rip_relative(insn)) return; /* - * insn_rip_relative() would have decoded rex_prefix, modrm. + * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. * Clear REX.b bit (extension of MODRM.rm field): - * we want to encode rax/rcx, not r8/r9. + * we want to encode low numbered reg, not r8+. */ if (insn->rex_prefix.nbytes) { cursor = auprobe->insn + insn_offset_rex_prefix(insn); - *cursor &= 0xfe; /* Clearing REX.B bit */ + /* REX byte has 0100wrxb layout, clearing REX.b bit */ + *cursor &= 0xfe; } + /* + * Similar treatment for VEX3 prefix. + * TODO: add XOP/EVEX treatment when insn decoder supports them + */ + if (insn->vex_prefix.nbytes == 3) { + /* + * vex2: c5 rvvvvLpp (has no b bit) + * vex3/xop: c4/8f rxbmmmmm wvvvvLpp + * evex: 62 rxbR00mm wvvvv1pp zllBVaaa + * (evex will need setting of both b and x since + * in non-sib encoding evex.x is 4th bit of MODRM.rm) + * Setting VEX3.b (setting because it has inverted meaning): + */ + cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; + *cursor |= 0x20; + } + + /* + * Convert from rip-relative addressing to register-relative addressing + * via a scratch register. + * + * This is tricky since there are insns with modrm byte + * which also use registers not encoded in modrm byte: + * [i]div/[i]mul: implicitly use dx:ax + * shift ops: implicitly use cx + * cmpxchg: implicitly uses ax + * cmpxchg8/16b: implicitly uses dx:ax and bx:cx + * Encoding: 0f c7/1 modrm + * The code below thinks that reg=1 (cx), chooses si as scratch. + * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. + * First appeared in Haswell (BMI2 insn). It is vex-encoded. + * Example where none of bx,cx,dx can be used as scratch reg: + * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx + * [v]pcmpistri: implicitly uses cx, xmm0 + * [v]pcmpistrm: implicitly uses xmm0 + * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 + * [v]pcmpestrm: implicitly uses ax, dx, xmm0 + * Evil SSE4.2 string comparison ops from hell. + * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. + * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. + * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). + * AMD says it has no 3-operand form (vex.vvvv must be 1111) + * and that it can have only register operands, not mem + * (its modrm byte must have mode=11). + * If these restrictions will ever be lifted, + * we'll need code to prevent selection of di as scratch reg! + * + * Summary: I don't know any insns with modrm byte which + * use SI register implicitly. DI register is used only + * by one insn (maskmovq) and BX register is used + * only by one too (cmpxchg8b). + * BP is stack-segment based (may be a problem?). + * AX, DX, CX are off-limits (many implicit users). + * SP is unusable (it's stack pointer - think about "pop mem"; + * also, rsp+disp32 needs sib encoding -> insn length change). + */ + reg = MODRM_REG(insn); /* Fetch modrm.reg */ + reg2 = 0xff; /* Fetch vex.vvvv */ + if (insn->vex_prefix.nbytes == 2) + reg2 = insn->vex_prefix.bytes[1]; + else if (insn->vex_prefix.nbytes == 3) + reg2 = insn->vex_prefix.bytes[2]; + /* + * TODO: add XOP, EXEV vvvv reading. + * + * vex.vvvv field is in bits 6-3, bits are inverted. + * But in 32-bit mode, high-order bit may be ignored. + * Therefore, let's consider only 3 low-order bits. + */ + reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; + /* + * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. + * + * Choose scratch reg. Order is important: must not select bx + * if we can use si (cmpxchg8b case!) + */ + if (reg != 6 && reg2 != 6) { + reg2 = 6; + auprobe->def.fixups |= UPROBE_FIX_RIP_SI; + } else if (reg != 7 && reg2 != 7) { + reg2 = 7; + auprobe->def.fixups |= UPROBE_FIX_RIP_DI; + /* TODO (paranoia): force maskmovq to not use di */ + } else { + reg2 = 3; + auprobe->def.fixups |= UPROBE_FIX_RIP_BX; + } /* * Point cursor at the modrm byte. The next 4 bytes are the * displacement. Beyond the displacement, for some instructions, @@ -296,41 +388,21 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) */ cursor = auprobe->insn + insn_offset_modrm(insn); /* - * Convert from rip-relative addressing - * to register-relative addressing via a scratch register. + * Change modrm from "00 reg 101" to "10 reg reg2". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 86 disp32 mov %eax,disp32(%rsi) */ - reg = MODRM_REG(insn); - if (reg == 0) { - /* - * The register operand (if any) is either the A register - * (%rax, %eax, etc.) or (if the 0x4 bit is set in the - * REX prefix) %r8. In any case, we know the C register - * is NOT the register operand, so we use %rcx (register - * #1) for the scratch register. - */ - auprobe->def.fixups |= UPROBE_FIX_RIP_CX; - /* - * Change modrm from "00 000 101" to "10 000 001". Example: - * 89 05 disp32 mov %eax,disp32(%rip) becomes - * 89 81 disp32 mov %eax,disp32(%rcx) - */ - *cursor = 0x81; - } else { - /* Use %rax (register #0) for the scratch register. */ - auprobe->def.fixups |= UPROBE_FIX_RIP_AX; - /* - * Change modrm from "00 reg 101" to "10 reg 000". Example: - * 89 1d disp32 mov %edx,disp32(%rip) becomes - * 89 98 disp32 mov %edx,disp32(%rax) - */ - *cursor = (reg << 3) | 0x80; - } + *cursor = 0x80 | (reg << 3) | reg2; } static inline unsigned long * scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return (auprobe->def.fixups & UPROBE_FIX_RIP_AX) ? ®s->ax : ®s->cx; + if (auprobe->def.fixups & UPROBE_FIX_RIP_SI) + return ®s->si; + if (auprobe->def.fixups & UPROBE_FIX_RIP_DI) + return ®s->di; + return ®s->bx; } /* @@ -339,7 +411,7 @@ scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) */ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); @@ -350,7 +422,7 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->def.fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); @@ -405,6 +477,23 @@ static int push_ret_address(struct pt_regs *regs, unsigned long ip) return 0; } +/* + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg). + */ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; @@ -711,21 +800,6 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. - * We have to fix things up as follows: - * - * Typically, the new ip is relative to the copied instruction. We need - * to make it relative to the original instruction (FIX_IP). Exceptions - * are return instructions and absolute or indirect jump or call instructions. - * - * If the single-stepped instruction was a call, the return address that - * is atop the stack is the address following the copied instruction. We - * need to make it the address following the original instruction (FIX_CALL). - * - * If the original instruction was a rip-relative instruction such as - * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rax)". - * We need to restore the contents of the scratch register - * (FIX_RIP_AX or FIX_RIP_CX). */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { From 5e1b05beeca8139204324581a5b1ffb53d057f96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:34:00 +0200 Subject: [PATCH 04/10] x86/traps: Make math_error() static Trivial, make math_error() static. Signed-off-by: Oleg Nesterov --- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/traps.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 58d66fe06b61..a7b212db9e04 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -98,7 +98,6 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; -void math_error(struct pt_regs *, int, int); void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c62..8eddd628326e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -488,7 +488,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; siginfo_t info; From 38cad57be9800e46c52a3612fb9d963eee4fd9c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 16:47:09 +0200 Subject: [PATCH 05/10] x86/traps: Use SEND_SIG_PRIV instead of force_sig() force_sig() is just force_sig_info(SEND_SIG_PRIV). Imho it should die, we have too many ugly "send signal" helpers. And do_trap() looks just ugly because it uses force_sig_info() or force_sig() depending on info != NULL. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8eddd628326e..2cd429117a41 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -168,10 +168,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, } #endif - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); + force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -305,7 +302,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_cont("\n"); } - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); exit: exception_exit(prev_state); } @@ -645,7 +642,7 @@ void math_state_restore(void) */ if (unlikely(restore_fpu_checking(tsk))) { drop_init_fpu(tsk); - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); return; } From dff0796e53c29147c9bd1f5567a261dcf0e528bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 17:21:34 +0200 Subject: [PATCH 06/10] x86/traps: Introduce do_error_trap() Move the common code from DO_ERROR() and DO_ERROR_INFO() into the new helper, do_error_trap(). This simplifies define's and shaves 527 bytes from traps.o. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2cd429117a41..ab8dad719b9e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -171,41 +171,37 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, + unsigned long trapnr, int signr, siginfo_t *info) +{ + enum ctx_state prev_state = exception_enter(); + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != + NOTIFY_STOP) { + conditional_sti(regs); + do_trap(trapnr, signr, str, regs, error_code, info); + } + + exception_exit(prev_state); +} + #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - enum ctx_state prev_state; \ - \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ - exception_exit(prev_state); \ + do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ - enum ctx_state prev_state; \ \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ - exception_exit(prev_state); \ + \ + do_error_trap(regs, error_code, str, trapnr, signr, &info); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) From 958d3d729802f7d741cbe8400e69b89baae580ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 7 May 2014 17:59:39 +0200 Subject: [PATCH 07/10] x86/traps: Introduce fill_trap_info(), simplify DO_ERROR_INFO() Extract the fill-siginfo code from DO_ERROR_INFO() into the new helper, fill_trap_info(). It can calculate si_code and si_addr looking at trapnr, so we can remove these arguments from DO_ERROR_INFO() and simplify the source code. The generated code is the same, __builtin_constant_p(trapnr) == T. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 53 +++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ab8dad719b9e..1cf8a4c409b6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,6 +136,33 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } +static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) +{ + unsigned long siaddr; + int sicode; + + switch (trapnr) { + case X86_TRAP_DE: + sicode = FPE_INTDIV; + siaddr = regs->ip; + break; + case X86_TRAP_UD: + sicode = ILL_ILLOPN; + siaddr = regs->ip; + break; + case X86_TRAP_AC: + sicode = BUS_ADRALN; + siaddr = 0; + break; + } + + info->si_signo = signr; + info->si_errno = 0; + info->si_code = sicode; + info->si_addr = (void __user *)siaddr; +} + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -191,30 +218,26 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - \ + fill_trap_info(regs, signr, trapnr, &info); \ do_error_trap(regs, error_code, str, trapnr, signr, &info); \ } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) -DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) -DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) -DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) -DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) +DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ From 1c326c4dfe182a1c4c1e39f2c00f04c380d11692 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:04:11 +0200 Subject: [PATCH 08/10] x86/traps: Shift fill_trap_info() from DO_ERROR_INFO() to do_error_trap() Move the callsite of fill_trap_info() into do_error_trap() and remove the "siginfo_t *info" argument. This obviously breaks DO_ERROR() which passed info == NULL, we simply change fill_trap_info() to return "siginfo_t *" and add the "default" case which returns SEND_SIG_PRIV. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1cf8a4c409b6..c9dd74124038 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,13 +136,16 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } -static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, - siginfo_t *info) +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) { unsigned long siaddr; int sicode; switch (trapnr) { + default: + return SEND_SIG_PRIV; + case X86_TRAP_DE: sicode = FPE_INTDIV; siaddr = regs->ip; @@ -161,6 +164,7 @@ static void fill_trap_info(struct pt_regs *regs, int signr, int trapnr, info->si_errno = 0; info->si_code = sicode; info->si_addr = (void __user *)siaddr; + return info; } static void __kprobes @@ -199,14 +203,16 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, } static void do_error_trap(struct pt_regs *regs, long error_code, char *str, - unsigned long trapnr, int signr, siginfo_t *info) + unsigned long trapnr, int signr) { enum ctx_state prev_state = exception_enter(); + siginfo_t info; if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { conditional_sti(regs); - do_trap(trapnr, signr, str, regs, error_code, info); + do_trap(trapnr, signr, str, regs, error_code, + fill_trap_info(regs, signr, trapnr, &info)); } exception_exit(prev_state); @@ -215,16 +221,13 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - do_error_trap(regs, error_code, str, trapnr, signr, NULL); \ + do_error_trap(regs, error_code, str, trapnr, signr); \ } #define DO_ERROR_INFO(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - siginfo_t info; \ - \ - fill_trap_info(regs, signr, trapnr, &info); \ - do_error_trap(regs, error_code, str, trapnr, signr, &info); \ + do_error_trap(regs, error_code, str, trapnr, signr); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) From 0eb14833d5b1ea1accfeffb71be5de5929f85da9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 May 2014 20:12:24 +0200 Subject: [PATCH 09/10] x86/traps: Kill DO_ERROR_INFO() Now that DO_ERROR_INFO() doesn't differ from DO_ERROR() we can remove it and use DO_ERROR() instead. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/traps.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c9dd74124038..73b3ea32245a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -224,23 +224,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr); \ -} - -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error) -DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) -DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: [PATCH 10/10] uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer(). Test-case: #include #include #include extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to cleanup them first. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/traps.c | 7 ++++--- include/linux/uprobes.h | 4 ++++ kernel/events/uprobes.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 73b3ea32245a..3fdb20548c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -148,11 +149,11 @@ static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, case X86_TRAP_DE: sicode = FPE_INTDIV; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_UD: sicode = ILL_ILLOPN; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_AC: sicode = BUS_ADRALN; @@ -531,7 +532,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..88c3b7e8b384 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r #else /* !CONFIG_UPROBES */ struct uprobes_state { }; + +#define uprobe_get_trap_addr(regs) instruction_pointer(regs) + static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e8bfa4..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread.