From e52c1dc7455d32c8a55f9949d300e5e87d011fa6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2025 09:57:31 +0200 Subject: [PATCH] x86/its: FineIBT-paranoid vs ITS FineIBT-paranoid was using the retpoline bytes for the paranoid check, disabling retpolines, because all parts that have IBT also have eIBRS and thus don't need no stinking retpolines. Except... ITS needs the retpolines for indirect calls must not be in the first half of a cacheline :-/ So what was the paranoid call sequence: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b lea -0x10(%r11), %r11 e: 75 fd jne d 10: 41 ff d3 call *%r11 13: 90 nop Now becomes: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b f0 lea -0x10(%r11), %r11 e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 Where the paranoid_thunk looks like: 1d: (bad) __x86_indirect_paranoid_thunk_r11: 1e: 75 fd jne 1d __x86_indirect_its_thunk_r11: 20: 41 ff eb jmp *%r11 23: cc int3 [ dhansen: remove initialization to false ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- arch/x86/include/asm/alternative.h | 8 ++ arch/x86/kernel/alternative.c | 145 +++++++++++++++++++++++++---- arch/x86/lib/retpoline.S | 15 ++- arch/x86/net/bpf_jit_comp.c | 2 +- tools/objtool/arch/x86/decode.c | 9 ++ 5 files changed, 159 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 47948ebbb409..f2294784babc 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -6,6 +6,7 @@ #include #include #include +#include #define ALT_FLAGS_SHIFT 16 @@ -128,10 +129,17 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, extern void its_init_mod(struct module *mod); extern void its_fini_mod(struct module *mod); extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); #else /* CONFIG_MITIGATION_ITS */ static inline void its_init_mod(struct module *mod) { } static inline void its_fini_mod(struct module *mod) { } static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} #endif #if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7a10e3ed5d0b..48fd04e90114 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -127,6 +127,10 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + #ifdef CONFIG_MITIGATION_ITS static struct module *its_mod; @@ -137,8 +141,25 @@ static unsigned int its_offset; static void *its_init_thunk(void *thunk, int reg) { u8 *bytes = thunk; + int offset = 0; int i = 0; +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence ( + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; @@ -147,7 +168,7 @@ static void *its_init_thunk(void *thunk, int reg) bytes[i++] = 0xe0 + reg; /* jmp *reg */ bytes[i++] = 0xcc; - return thunk; + return thunk + offset; } void its_init_mod(struct module *mod) @@ -217,6 +238,17 @@ static void *its_allocate_thunk(int reg) int size = 3 + (reg / 8); void *thunk; +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { its_page = its_alloc(); if (!its_page) { @@ -240,6 +272,18 @@ static void *its_allocate_thunk(int reg) return its_init_thunk(thunk, reg); } +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + #endif /* @@ -775,8 +819,17 @@ static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) /* Lower-half of the cacheline? */ return !(addr & 0x20); } +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} #endif +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -893,6 +946,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -909,6 +963,12 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -1198,8 +1258,6 @@ int cfi_get_func_arity(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; -static bool cfi_paranoid __ro_after_init = false; - /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1612,6 +1670,19 @@ static int cfi_rand_callers(s32 *start, s32 *end) return 0; } +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -1653,9 +1724,14 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); - ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); - if (WARN_ON_ONCE(ret != 3)) - continue; + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } text_poke_early(addr, bytes, fineibt_paranoid_size); } @@ -1882,29 +1958,66 @@ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 return false; } +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] - * sequence. + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; - u32 hash; - if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + if (!cfi_paranoid) return false; - __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); - *target = regs->r11 + fineibt_preamble_size; - *type = regs->r10; + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } /* - * Since the trapping instruction is the exact, but LOCK prefixed, - * Jcc.d8 that got us here, the normal fixup will work. + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * */ - return true; + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } -Efault: return false; } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ebca28fe7e31..39374949daa2 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -371,6 +371,15 @@ SYM_FUNC_END(call_depth_return_thunk) .macro ITS_THUNK reg +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR @@ -378,19 +387,19 @@ SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) jmp *%\reg int3 .align 32, 0xcc /* fill to the end of the line */ - .skip 32, 0xcc /* skip to the next upper half */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ .endm /* ITS mitigation requires thunks be aligned to upper half of cacheline */ .align 64, 0xcc -.skip 32, 0xcc -SYM_CODE_START(__x86_indirect_its_thunk_array) +.skip 29, 0xcc #define GEN(reg) ITS_THUNK reg #include #undef GEN .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) SYM_CODE_END(__x86_indirect_its_thunk_array) .align 64, 0xcc diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a5b65c09910b..a31e58c6d89e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -663,7 +663,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_its_thunk_array[reg], ip); + emit_jump(&prog, its_static_thunk(reg), ip); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 3ce7b54003c2..687c5eafb49a 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -189,6 +189,15 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec op2 = ins.opcode.bytes[1]; op3 = ins.opcode.bytes[2]; + /* + * XXX hack, decoder is buggered and thinks 0xea is 7 bytes long. + */ + if (op1 == 0xea) { + insn->len = 1; + insn->type = INSN_BUG; + return 0; + } + if (ins.rex_prefix.nbytes) { rex = ins.rex_prefix.bytes[0]; rex_w = X86_REX_W(rex) >> 3;