x86: inline the 'rep movs' in user copies for the FSRM case
This does the same thing for the user copies as commit 0db7058
("x86/clear_user: Make it faster") did for clear_user().  In other
words, it inlines the "rep movs" case when X86_FEATURE_FSRM is set,
avoiding the function call entirely.

In order to do that, it makes the calling convention for the out-of-line
case ("copy_user_generic_unrolled") match the 'rep movs' calling
convention, although it does also end up clobbering a number of
additional registers.
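
For readers who want the register contract spelled out: 'rep movs' takes the destination in %rdi, the source in %rsi and the byte count in %rcx, and on completion %rcx holds the bytes that were left uncopied (zero on success). A minimal user-space sketch of that contract, purely illustrative and not part of this patch:

	#include <stdio.h>

	/* Copy 'len' bytes with 'rep movsb'; returns the bytes NOT copied.
	 * In user space a fault would just kill the process, so this
	 * normally returns 0 -- the kernel instead relies on an exception
	 * table entry to land in fixup code with the remaining count
	 * still in %rcx. */
	static unsigned long rep_movsb_copy(void *to, const void *from, unsigned long len)
	{
		asm volatile("rep movsb"
			     : "+c" (len), "+D" (to), "+S" (from)
			     : : "memory");
		return len;
	}

	int main(void)
	{
		char src[] = "hello, fsrm";
		char dst[sizeof(src)] = { 0 };
		unsigned long left = rep_movsb_copy(dst, src, sizeof(src));

		printf("left=%lu dst=\"%s\"\n", left, dst);	/* left=0 */
		return 0;
	}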

Also, to simplify code sharing in the low-level assembly with the
__copy_user_nocache() function (that uses the normal C calling
convention), we end up with a kind of mixed return value for the
low-level asm code: it will return the result in both %rcx (to work as
an alternative for the 'rep movs' case), _and_ in %rax (for the nocache
case).

We could avoid this by wrapping __copy_user_nocache() callers in an
inline asm, but since the cost is just an extra register copy, it's
probably not worth it.
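
To make the rejected alternative concrete: such a wrapper would pin the nocache helper to the same %rcx-based contract from the C side, roughly along these lines. This is a hypothetical sketch only, not in this patch, and it assumes the asm side would be changed to take the count in %rcx and report the remainder there:

	/* Hypothetical: a C-visible wrapper that would make the extra %rax
	 * copy of the result unnecessary by reading it straight out of %rcx. */
	static __always_inline unsigned long
	copy_user_nocache_inlined(void *to, const void *from, unsigned long len)
	{
		asm volatile("call __copy_user_nocache"
			     : "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
			     : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
		return len;	/* bytes not copied */
	}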

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Linus Torvalds committed Apr 19, 2023
1 parent 3639a53 commit 577e6a7
Showing 3 changed files with 31 additions and 48 deletions.
23 changes: 10 additions & 13 deletions arch/x86/include/asm/uaccess_64.h
@@ -18,29 +18,26 @@
 
 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);
 
 static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
 {
-	unsigned ret;
-
 	stac();
 	/*
 	 * If CPU has FSRM feature, use 'rep movs'.
 	 * Otherwise, use copy_user_generic_unrolled.
 	 */
-	alternative_call(copy_user_generic_unrolled,
-			 copy_user_fast_string,
-			 X86_FEATURE_FSRM,
-			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
-				     "=d" (len)),
-			 "1" (to), "2" (from), "3" (len)
-			 : "memory", "rcx", "r8", "r9", "r10", "r11");
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE("rep movsb",
+			    "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM))
+		"2:\n"
+		_ASM_EXTABLE_UA(1b, 2b)
+		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		: : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
 	clac();
-	return ret;
+	return len;
 }
 
 static __always_inline __must_check unsigned long
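
For context on how the new return value is consumed: the raw_copy_{from,to}_user() helpers later in this header simply pass the count of uncopied bytes back up, so the inlined 'rep movsb' result in %rcx feeds straight into the usual copy_*_user() contract. A condensed sketch of the from-user side (see the full header for the exact code):

	static __always_inline __must_check unsigned long
	raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
	{
		/* returns the number of bytes that could not be copied */
		return copy_user_generic(dst, (__force void *)src, size);
	}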
55 changes: 21 additions & 34 deletions arch/x86/lib/copy_user_64.S
@@ -45,13 +45,29 @@
  * Input:
  * rdi destination
  * rsi source
- * rdx count
+ * rcx count
  *
  * Output:
- * eax uncopied bytes or 0 if successful.
+ * rcx uncopied bytes or 0 if successful.
+ *
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM.
+ *
+ * HOWEVER! This function ends up having a lot of the code common
+ * with __copy_user_nocache(), which is a normal C function, and
+ * has a similar calling convention, but gets the 'count' in %rdx,
+ * and returns the result in %rax.
+ *
+ * To share as much code as possible, we end up returning the
+ * result in *both* %rcx/%rax, and we also move the initial count
+ * into %rdx.
+ *
+ * We can clobber rdx/rsi/rdi and r8-r11
  */
 SYM_FUNC_START(copy_user_generic_unrolled)
-	cmpl $8,%edx
+	movl %ecx,%edx
+	cmpl $8,%ecx
 	jb .Lcopy_user_short_string_bytes
 	ALIGN_DESTINATION
 	movl %edx,%ecx
@@ -103,37 +119,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
 SYM_FUNC_END(copy_user_generic_unrolled)
 EXPORT_SYMBOL(copy_user_generic_unrolled)
 
-/*
- * Some CPUs support FSRM for Fast Short REP MOVS.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_fast_string)
-	movl %edx,%ecx
-1:	rep movsb
-	xorl %eax,%eax
-	RET
-
-12:	movl %ecx,%eax		/* ecx is zerorest also */
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_fast_string)
-EXPORT_SYMBOL(copy_user_fast_string)
-
 /*
  * Try to copy last bytes and clear the rest if needed.
  * Since protection fault in copy_from/to_user is not a normal situation,
@@ -160,6 +145,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
 
 3:
 	movl %edx,%eax
+	movl %edx,%ecx
 	RET
 
 	_ASM_EXTABLE_CPY(1b, 2b)
@@ -203,6 +189,7 @@ SYM_CODE_START_LOCAL(copy_user_short_string)
 	decl %ecx
 	jnz 21b
 23:	xor %eax,%eax
+	xor %ecx,%ecx
 	RET
 
 40:	leal (%rdx,%rcx,8),%edx
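
Downstream, the leftover count computed by the tail handlers above surfaces as the return value of copy_to_user()/copy_from_user(), where callers normally only test it against zero. A typical consumer pattern, shown for illustration only (the function and buffer names here are placeholders):

	/* Illustrative only: how the 'uncopied bytes' result is normally used. */
	static long example_read(char __user *ubuf, const char *kbuf, size_t len)
	{
		if (copy_to_user(ubuf, kbuf, len))
			return -EFAULT;	/* some bytes were not copied */
		return len;		/* full copy succeeded */
	}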
1 change: 0 additions & 1 deletion tools/objtool/check.c
@@ -1286,7 +1286,6 @@ static const char *uaccess_safe_builtin[] = {
 	"ftrace_likely_update",	/* CONFIG_TRACE_BRANCH_PROFILING */
 	"clear_user_original",
 	"copy_user_generic_unrolled",
-	"copy_user_fast_string",
 	"__copy_user_nocache",
 	NULL
 };
