[PATCH] x86_64: Remove optimization for B stepping AMD K8
B stepping CPUs were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized versions for them. These CPUs are really
old and in the minority now, and the difference from the generic versions
(using rep microcode) is not that big anyway. So just remove them.

TODO: figure out optimized versions for Intel Netburst-based EM64T

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Andi Kleen authored and Linus Torvalds committed Nov 15, 2005
1 parent a6f5deb commit a5b250a
Showing 4 changed files with 2 additions and 310 deletions.
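
The "generic versions (using rep microcode)" that the commit message keeps are the rep-string bodies whose tails remain visible in the memcpy.S and memset.S hunks below. As a rough illustration of that style, here is a minimal standalone sketch of a rep-string memcpy (the memcpy_rep label is illustrative, not a kernel symbol):

	.globl memcpy_rep		/* rdi = dest, rsi = src, rdx = count */
	.p2align 4
memcpy_rep:
	movq	%rdi,%rax		/* memcpy returns the original destination */
	movq	%rdx,%rcx
	shrq	$3,%rcx			/* copy whole quadwords first */
	rep
	movsq
	movl	%edx,%ecx
	andl	$7,%ecx			/* then the 0-7 remaining bytes */
	rep
	movsb
	ret

rep movsq/movsb lets the CPU's microcode pick the copy strategy, which is why, per the commit message, the hand-unrolled loops removed below buy little on later steppings.
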
38 changes: 0 additions & 38 deletions arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
.globl clear_page
.p2align 4
clear_page:
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
ret
clear_page_end:

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad clear_page
.quad clear_page_c
.byte X86_FEATURE_K8_C
.byte clear_page_end-clear_page
.byte clear_page_c_end-clear_page_c
.previous

.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
clear_page_c_end:
.previous
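
Each of the removed files wires its rep-string variant in through the kernel's alternative-instructions table: a record of the original address, the replacement address, the gating CPU feature bit, and the two code lengths, which the boot-time patching code uses to overwrite the slow loop on C stepping parts. A compact sketch of that pattern, modeled on the clear_page entry above (X86_FEATURE_K8_C and the section names are taken from this diff; the clear_page_slow/clear_page_fast labels are illustrative):

	#include <asm/cpufeature.h>

	.globl clear_page_slow		/* rdi = page address */
	.p2align 4
clear_page_slow:
	xorl	%eax,%eax
	movl	$4096/8,%ecx		/* 512 quadwords per 4 KB page */
.Lclear_loop:
	movq	%rax,(%rdi)
	addq	$8,%rdi
	decl	%ecx
	jnz	.Lclear_loop
	ret
clear_page_slow_end:

	.section .altinstructions,"a"
	.align 8
	.quad clear_page_slow			/* address of the default routine */
	.quad clear_page_fast			/* address of the replacement */
	.byte X86_FEATURE_K8_C			/* CPUID feature bit that enables the swap */
	.byte clear_page_slow_end-clear_page_slow	/* length of the original */
	.byte clear_page_fast_end-clear_page_fast	/* length of the replacement */
	.previous

	.section .altinstr_replacement,"ax"
clear_page_fast:
	movl	$4096/8,%ecx
	xorl	%eax,%eax
	rep
	stosq
	ret
clear_page_fast_end:
	.previous
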
87 changes: 0 additions & 87 deletions arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
.globl copy_page
.p2align 4
copy_page:
subq $3*8,%rsp
movq %rbx,(%rsp)
movq %r12,1*8(%rsp)
movq %r13,2*8(%rsp)

movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
dec %rcx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

prefetcht0 5*64(%rsi)

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi

jnz .Loop64

movl $5,%ecx
.p2align 4
.Loop2:
decl %ecx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi

jnz .Loop2

movq (%rsp),%rbx
movq 1*8(%rsp),%r12
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret

/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_K8_C
.byte copy_page_c_end-copy_page_c
.byte copy_page_c_end-copy_page_c
.previous

.section .altinstr_replacement,"ax"
copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
copy_page_c_end:
.previous
93 changes: 2 additions & 91 deletions arch/x86_64/lib/memcpy.S
@@ -11,102 +11,15 @@
*
* Output:
* rax original destination
*
* TODO: check best memcpy for PSC
*/

.globl __memcpy
.globl memcpy
.p2align 4
__memcpy:
memcpy:
pushq %rbx
movq %rdi,%rax

movl %edx,%ecx
shrl $6,%ecx
jz .Lhandle_tail

.p2align 4
.Lloop_64:
decl %ecx

movq (%rsi),%r11
movq 8(%rsi),%r8

movq %r11,(%rdi)
movq %r8,1*8(%rdi)

movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10

movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)

movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8

movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)

movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10

movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)

leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
jnz .Lloop_64

.Lhandle_tail:
movl %edx,%ecx
andl $63,%ecx
shrl $3,%ecx
jz .Lhandle_7
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi),%r8
movq %r8,(%rdi)
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jnz .Lloop_8

.Lhandle_7:
movl %edx,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1

.Lende:
popq %rbx
ret
.Lfinal:

/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

.section .altinstructions,"a"
.align 8
.quad memcpy
.quad memcpy_c
.byte X86_FEATURE_K8_C
.byte .Lfinal-memcpy
.byte memcpy_c_end-memcpy_c
.previous

.section .altinstr_replacement,"ax"
/* rdi destination
* rsi source
* rdx count
*/
memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
rep
movsb
ret
memcpy_c_end:
.previous
94 changes: 0 additions & 94 deletions arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
.p2align 4
memset:
__memset:
movq %rdi,%r10
movq %rdx,%r11

/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
mul %rcx /* with rax, clobbers rdx */

/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:

movl %r11d,%ecx
shrl $6,%ecx
jz .Lhandle_tail

.p2align 4
.Lloop_64:
decl %ecx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64

/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %r11d,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8

.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
decl %ecx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1

.Lende:
movq %r10,%rax
ret

.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad memset
.quad memset_c
.byte X86_FEATURE_K8_C
.byte memset_c_end-memset_c
.byte memset_c_end-memset_c
.previous

.section .altinstr_replacement,"ax"
/* rdi destination
* rsi value
* rdx count
*/
memset_c:
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
stosb
movq %r9,%rax
ret
memset_c_end:
.previous
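
One detail of the removed memset worth noting: it broadcasts the fill byte to all eight byte lanes of %rax by multiplying with 0x0101010101010101 (the "expand byte value" step above); the product never overflows 64 bits because the byte is at most 0xff. A tiny standalone sketch of just that step (the expand_fill_byte label is illustrative):

	.globl expand_fill_byte		/* in: fill byte in %sil; out: %rax = byte repeated 8 times */
	.p2align 4
expand_fill_byte:
	movzbl	%sil,%ecx		/* zero-extend the fill byte */
	movabs	$0x0101010101010101,%rax
	mul	%rcx			/* 64-bit multiply; clobbers %rdx, high half is zero */
	ret
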
