x86/its: Align RETs in BHB clear sequence to avoid thunking

The software mitigation for BHI is to execute BHB clear sequence at syscall entry, and possibly after a cBPF program. ITS mitigation thunks RETs in the lower half of the cacheline. This causes the RETs in the BHB clear sequence to be thunked as well, adding unnecessary branches to the BHB clear sequence. Since the sequence is in hot path, align the RET instructions in the sequence to avoid thunking. This is how disassembly clear_bhb_loop() looks like after this change: 0x44 <+4>: mov $0x5,%ecx 0x49 <+9>: call 0xffffffff81001d9b <clear_bhb_loop+91> 0x4e <+14>: jmp 0xffffffff81001de5 <clear_bhb_loop+165> 0x53 <+19>: int3 ... 0x9b <+91>: call 0xffffffff81001dce <clear_bhb_loop+142> 0xa0 <+96>: ret 0xa1 <+97>: int3 ... 0xce <+142>: mov $0x5,%eax 0xd3 <+147>: jmp 0xffffffff81001dd6 <clear_bhb_loop+150> 0xd5 <+149>: nop 0xd6 <+150>: sub $0x1,%eax 0xd9 <+153>: jne 0xffffffff81001dd3 <clear_bhb_loop+147> 0xdb <+155>: sub $0x1,%ecx 0xde <+158>: jne 0xffffffff81001d9b <clear_bhb_loop+91> 0xe0 <+160>: ret 0xe1 <+161>: int3 0xe2 <+162>: int3 0xe3 <+163>: int3 0xe4 <+164>: int3 0xe5 <+165>: lfence 0xe8 <+168>: pop %rbp 0xe9 <+169>: ret Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
mariux64 · May 9, 2025 · f0cd709 · f0cd709
1 parent facd226
commit f0cd709
Showing 1 changed file with 17 additions and 3 deletions.
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
@@ -1525,7 +1525,9 @@ SYM_CODE_END(rewind_stack_and_make_dead)
  * ORC to unwind properly.
  *
  * The alignment is for performance and not for safety, and may be safely
- * refactored in the future if needed.
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
  */
 SYM_FUNC_START(clear_bhb_loop)
 	ANNOTATE_NOENDBR
@@ -1536,18 +1538,30 @@ SYM_FUNC_START(clear_bhb_loop)
 	call	1f
 	jmp	5f
 	.align 64, 0xcc
+	/*
+	 * Shift instructions so that the RET is in the upper half of the
+	 * cacheline and don't take the slowpath to its_return_thunk.
+	 */
+	.skip 32 - (.Lret1 - 1f), 0xcc
 	ANNOTATE_INTRA_FUNCTION_CALL
 1:	call	2f
-	RET
+.Lret1:	RET
 	.align 64, 0xcc
+	/*
+	 * As above shift instructions for RET at .Lret2 as well.
+	 *
+	 * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+	 * but some Clang versions (e.g. 18) don't like this.
+	 */
+	.skip 32 - 18, 0xcc
 2:	movl	$5, %eax
 3:	jmp	4f
 	nop
 4:	sub	$1, %eax
 	jnz	3b
 	sub	$1, %ecx
 	jnz	1b
-	RET
+.Lret2:	RET
 5:	lfence
 	pop	%rbp
 	RET