Skip to content

Commit

Permalink
Merge branch 'x86-rep-insns': x86 user copy clarifications
Browse files Browse the repository at this point in the history
Merge my x86 user copy updates branch.

This cleans up a lot of our x86 memory copy code, particularly for user
accesses.  I've been pushing for microarchitectural support for good
memory copying and clearing for a long while, and it's been visible in
how the kernel has aggressively used 'rep movs' and 'rep stos' whenever
possible.

And that micro-architectural support has been improving over the years,
to the point where on modern CPU's the best option for a memory copy
that would become a function call (as opposed to being something that
can just be turned into individual 'mov' instructions) is now to inline
the string instruction sequence instead.

However, that only makes sense when we have the modern markers for this:
the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS").

So this cleans up a lot of our historical code, gets rid of the legacy
marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and
replaces it with that modern reality.  Note that REP_GOOD and ERMS end
up still being used by the known large cases (ie page copyin gand
clearing).

The reason much of this ends up being about user memory accesses is that
the normal in-kernel cases are done by the compiler (__builtin_memcpy()
and __builtin_memset()) and getting to the point where we can use our
instruction rewriting to inline those to be string instructions will
need some compiler support.

In contrast, the user accessor functions are all entirely controlled by
the kernel code, so we can change those arbitrarily.

Thanks to Borislav Petkov for feedback on the series, and Jens testing
some of this on micro-architectures I didn't personally have access to.

* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
  • Loading branch information
Linus Torvalds committed Apr 24, 2023
2 parents 487c20b + 034ff37 commit a562456
Show file tree
Hide file tree
Showing 11 changed files with 458 additions and 604 deletions.
62 changes: 25 additions & 37 deletions arch/x86/include/asm/uaccess_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,32 +18,26 @@

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);
rep_movs_alternative(void *to, const void *from, unsigned len);

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
copy_user_generic(void *to, const void *from, unsigned long len)
{
unsigned ret;

stac();
/*
* If CPU has ERMS feature, use copy_user_enhanced_fast_string.
* Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
* Otherwise, use copy_user_generic_unrolled.
* If CPU has FSRM feature, use 'rep movs'.
* Otherwise, use rep_movs_alternative.
*/
alternative_call_2(copy_user_generic_unrolled,
copy_user_generic_string,
X86_FEATURE_REP_GOOD,
copy_user_enhanced_fast_string,
X86_FEATURE_ERMS,
ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
"=d" (len)),
"1" (to), "2" (from), "3" (len)
: "memory", "rcx", "r8", "r9", "r10", "r11");
return ret;
asm volatile(
"1:\n\t"
ALTERNATIVE("rep movsb",
"call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
"2:\n"
_ASM_EXTABLE_UA(1b, 2b)
:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
: : "memory", "rax", "r8", "r9", "r10", "r11");
clac();
return len;
}

static __always_inline __must_check unsigned long
Expand All @@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);

extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len);
Expand All @@ -69,8 +61,12 @@ static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
{
long ret;
kasan_check_write(dst, size);
return __copy_user_nocache(dst, src, size, 0);
stac();
ret = __copy_user_nocache(dst, src, size);
clac();
return ret;
}

static inline int
Expand All @@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
*/

__must_check unsigned long
clear_user_original(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_rep_good(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_erms(void __user *addr, unsigned long len);
rep_stos_alternative(void __user *addr, unsigned long len);

static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
Expand All @@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
*/
asm volatile(
"1:\n\t"
ALTERNATIVE_3("rep stosb",
"call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM),
"call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
"call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
ALTERNATIVE("rep stosb",
"call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
"2:\n"
_ASM_EXTABLE_UA(1b, 2b)
: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
: "a" (0)
/* rep_good clobbers %rdx */
: "rdx");
: "a" (0));

clac();

Expand Down
4 changes: 4 additions & 0 deletions arch/x86/kernel/cpu/amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);

/* AMD FSRM also implies FSRS */
if (cpu_has(c, X86_FEATURE_FSRM))
set_cpu_cap(c, X86_FEATURE_FSRS);

/* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id();

Expand Down
2 changes: 1 addition & 1 deletion arch/x86/lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
endif
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
lib-y += copy_user_64.o copy_user_uncached_64.o
lib-y += cmpxchg16b_emu.o
endif
183 changes: 67 additions & 116 deletions arch/x86/lib/clear_page_64.S
Original file line number Diff line number Diff line change
Expand Up @@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* Input:
* rdi destination
* rcx count
* rax is zero
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(clear_user_original)
/*
* Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
* i.e., no need for a 'q' suffix and thus a REX prefix.
*/
mov %ecx,%eax
shr $3,%rcx
jz .Lrest_bytes
SYM_FUNC_START(rep_stos_alternative)
cmpq $64,%rcx
jae .Lunrolled

# do the qwords first
.p2align 4
.Lqwords:
movq $0,(%rdi)
lea 8(%rdi),%rdi
dec %rcx
jnz .Lqwords
cmp $8,%ecx
jae .Lword

.Lrest_bytes:
and $7, %eax
jz .Lexit
testl %ecx,%ecx
je .Lexit

# now do the rest bytes
.Lbytes:
movb $0,(%rdi)
.Lclear_user_tail:
0: movb %al,(%rdi)
inc %rdi
dec %eax
jnz .Lbytes

dec %rcx
jnz .Lclear_user_tail
.Lexit:
/*
* %rax still needs to be cleared in the exception case because this function is called
* from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
* in case it might reuse it somewhere.
*/
xor %eax,%eax
RET

.Lqwords_exception:
# convert remaining qwords back into bytes to return to caller
shl $3, %rcx
and $7, %eax
add %rax,%rcx
jmp .Lexit

.Lbytes_exception:
mov %eax,%ecx
jmp .Lexit

_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)

/*
* Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
* present.
* Input:
* rdi destination
* rcx count
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(clear_user_rep_good)
# call the original thing for less than a cacheline
cmp $64, %rcx
jb clear_user_original

.Lprep:
# copy lower 32-bits for rest bytes
mov %ecx, %edx
shr $3, %rcx
jz .Lrep_good_rest_bytes

.Lrep_good_qwords:
rep stosq

.Lrep_good_rest_bytes:
and $7, %edx
jz .Lrep_good_exit

.Lrep_good_bytes:
mov %edx, %ecx
rep stosb

.Lrep_good_exit:
# see .Lexit comment above
xor %eax, %eax
RET

.Lrep_good_qwords_exception:
# convert remaining qwords back into bytes to return to caller
shl $3, %rcx
and $7, %edx
add %rdx, %rcx
jmp .Lrep_good_exit
_ASM_EXTABLE_UA( 0b, .Lexit)

_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
SYM_FUNC_END(clear_user_rep_good)
EXPORT_SYMBOL(clear_user_rep_good)
.Lword:
1: movq %rax,(%rdi)
addq $8,%rdi
sub $8,%ecx
je .Lexit
cmp $8,%ecx
jae .Lword
jmp .Lclear_user_tail

/*
* Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
* Input:
* rdi destination
* rcx count
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*
*/
SYM_FUNC_START(clear_user_erms)
# call the original thing for less than a cacheline
cmp $64, %rcx
jb clear_user_original

.Lerms_bytes:
rep stosb

.Lerms_exit:
xorl %eax,%eax
.p2align 4
.Lunrolled:
10: movq %rax,(%rdi)
11: movq %rax,8(%rdi)
12: movq %rax,16(%rdi)
13: movq %rax,24(%rdi)
14: movq %rax,32(%rdi)
15: movq %rax,40(%rdi)
16: movq %rax,48(%rdi)
17: movq %rax,56(%rdi)
addq $64,%rdi
subq $64,%rcx
cmpq $64,%rcx
jae .Lunrolled
cmpl $8,%ecx
jae .Lword
testl %ecx,%ecx
jne .Lclear_user_tail
RET

_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
SYM_FUNC_END(clear_user_erms)
EXPORT_SYMBOL(clear_user_erms)
/*
* If we take an exception on any of the
* word stores, we know that %rcx isn't zero,
* so we can just go to the tail clearing to
* get the exact count.
*
* The unrolled case might end up clearing
* some bytes twice. Don't care.
*
* We could use the value in %rdi to avoid
* a second fault on the exact count case,
* but do we really care? No.
*
* Finally, we could try to align %rdi at the
* top of the unrolling. But unaligned stores
* just aren't that common or expensive.
*/
_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)
Loading

0 comments on commit a562456

Please sign in to comment.