---
r: 14215
b: refs/heads/master
c: a5b250a
h: refs/heads/master
i:
  14213: 0f41c84
  14211: 2ef12ad
  14207: 66c348e
v: v3
Andi Kleen authored and Linus Torvalds committed Nov 15, 2005
1 parent b02843d commit 832a0d0
Showing 5 changed files with 3 additions and 311 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: a6f5deb2be4c82f24fefadcbf7e448f540c05ae6
refs/heads/master: a5b250a428aabc619ace872f8220a7d0b8f7d557
38 changes: 0 additions & 38 deletions trunk/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
.globl clear_page
.p2align 4
clear_page:
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
ret
clear_page_end:

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad clear_page
.quad clear_page_c
.byte X86_FEATURE_K8_C
.byte clear_page_end-clear_page
.byte clear_page_c_end-clear_page_c
.previous

.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
clear_page_c_end:
.previous
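
With this hunk, clear_page loses both the unrolled store loop and the .altinstructions record that let the boot-time patcher substitute the rep stosq body on C-stepping K8 parts; the hunk counts (46 old lines down to 8) leave room for little beyond the short string body. A rough C rendering of the two strategies (a sketch only; the function names, the PAGE_SIZE constant, and the inline-asm wrapper are assumptions, not kernel code):

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096  /* assumed x86_64 base page size */

/* Mirrors the deleted .Lloop: eight quadword stores per iteration,
   64 bytes at a time, PAGE_SIZE/64 iterations in total. */
static void clear_page_unrolled(void *page)
{
    uint64_t *p = page;
    for (size_t i = 0; i < PAGE_SIZE / 64; i++, p += 8) {
        p[0] = 0; p[1] = 0; p[2] = 0; p[3] = 0;
        p[4] = 0; p[5] = 0; p[6] = 0; p[7] = 0;
    }
}

/* Mirrors the surviving body: one rep stosq clears the whole page,
   with %rdi as destination, %rcx as count, %rax as the zero pattern. */
static void clear_page_string(void *page)
{
    unsigned long cnt = PAGE_SIZE / 8;
    asm volatile("rep stosq"
                 : "+D" (page), "+c" (cnt)
                 : "a" (0UL)
                 : "memory");
}

Each deleted .altinstructions entry packs five fields: patch site, replacement code, the CPU feature that gates the patch, and the two lengths. A hypothetical C view of that record (the kernel's real struct alt_instr may differ in field names and padding):

struct alt_instr_sketch {
    const void *instr;          /* .quad clear_page: code to patch over  */
    const void *replacement;    /* .quad clear_page_c: patch body        */
    uint8_t cpuid;              /* .byte X86_FEATURE_K8_C: required flag */
    uint8_t instrlen;           /* .byte clear_page_end-clear_page       */
    uint8_t replacementlen;     /* .byte clear_page_c_end-clear_page_c   */
};
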
87 changes: 0 additions & 87 deletions trunk/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
.globl copy_page
.p2align 4
copy_page:
subq $3*8,%rsp
movq %rbx,(%rsp)
movq %r12,1*8(%rsp)
movq %r13,2*8(%rsp)

movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
dec %rcx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

prefetcht0 5*64(%rsi)

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi

jnz .Loop64

movl $5,%ecx
.p2align 4
.Loop2:
decl %ecx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi

jnz .Loop2

movq (%rsp),%rbx
movq 1*8(%rsp),%r12
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret

/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_K8_C
.byte copy_page_c_end-copy_page_c
.byte copy_page_c_end-copy_page_c
.previous

.section .altinstr_replacement,"ax"
copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
copy_page_c_end:
.previous
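
The deleted copy loop is organized around prefetch distance: .Loop64 runs (4096/64)-5 iterations and issues prefetcht0 five cache lines (5*64 bytes) ahead of the read pointer, then .Loop2 copies the last five 64-byte blocks with no prefetch so nothing is fetched past the end of the source page. It also spills the callee-saved %rbx, %r12 and %r13 because the eight-quadword body needs more scratch registers than the call-clobbered set provides. A C sketch of that shape (illustrative only; __builtin_prefetch stands in for prefetcht0, and the names are assumptions):

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096  /* assumed x86_64 base page size */

static void copy_page_sketch(void *dst, const void *src)
{
    uint64_t *d = dst;
    const uint64_t *s = src;
    size_t i;

    /* Main loop: copy 64 bytes per iteration, prefetching 5 lines ahead. */
    for (i = 0; i < PAGE_SIZE / 64 - 5; i++, d += 8, s += 8) {
        __builtin_prefetch((const char *)s + 5 * 64, 0 /* read */, 3);
        for (int j = 0; j < 8; j++)
            d[j] = s[j];
    }
    /* Last five blocks: same copy, no prefetch past the source page. */
    for (; i < PAGE_SIZE / 64; i++, d += 8, s += 8)
        for (int j = 0; j < 8; j++)
            d[j] = s[j];
}
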
93 changes: 2 additions & 91 deletions trunk/arch/x86_64/lib/memcpy.S
@@ -11,102 +11,15 @@
*
* Output:
* rax original destination
*
* TODO: check best memcpy for PSC
*/

.globl __memcpy
.globl memcpy
.p2align 4
__memcpy:
memcpy:
pushq %rbx
movq %rdi,%rax

movl %edx,%ecx
shrl $6,%ecx
jz .Lhandle_tail

.p2align 4
.Lloop_64:
decl %ecx

movq (%rsi),%r11
movq 8(%rsi),%r8

movq %r11,(%rdi)
movq %r8,1*8(%rdi)

movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10

movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)

movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8

movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)

movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10

movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)

leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
jnz .Lloop_64

.Lhandle_tail:
movl %edx,%ecx
andl $63,%ecx
shrl $3,%ecx
jz .Lhandle_7
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi),%r8
movq %r8,(%rdi)
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jnz .Lloop_8

.Lhandle_7:
movl %edx,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1

.Lende:
popq %rbx
ret
.Lfinal:

/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

.section .altinstructions,"a"
.align 8
.quad memcpy
.quad memcpy_c
.byte X86_FEATURE_K8_C
.byte .Lfinal-memcpy
.byte memcpy_c_end-memcpy_c
.previous

.section .altinstr_replacement,"ax"
/* rdi destination
* rsi source
* rdx count
*/
memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
rep
movsb
ret
memcpy_c_end:
.previous
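
The removed loop memcpy stages the count three ways: len>>6 iterations of an eight-quadword block copy (.Lloop_64), then (len&63)>>3 single quadwords (.Lloop_8), then len&7 bytes (.Lloop_1), while the rep movsq/movsb string code survives as the function body. Roughly in C (a sketch; the asm keeps everything in registers and, like this code, leans on x86 allowing unaligned 8-byte accesses):

#include <stdint.h>
#include <stddef.h>

static void *memcpy_sketch(void *dst, const void *src, size_t len)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    for (size_t n = len >> 6; n; n--) {          /* .Lloop_64: 64B blocks */
        for (int j = 0; j < 8; j++)
            ((uint64_t *)d)[j] = ((const uint64_t *)s)[j];
        d += 64; s += 64;
    }
    for (size_t n = (len & 63) >> 3; n; n--) {   /* .Lloop_8: quadwords */
        *(uint64_t *)d = *(const uint64_t *)s;
        d += 8; s += 8;
    }
    for (size_t n = len & 7; n; n--)             /* .Lloop_1: bytes */
        *d++ = *s++;

    return dst;  /* like %rax: the original destination */
}
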
94 changes: 0 additions & 94 deletions trunk/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
.p2align 4
memset:
__memset:
movq %rdi,%r10
movq %rdx,%r11

/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
mul %rcx /* with rax, clobbers rdx */

/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:

movl %r11d,%ecx
shrl $6,%ecx
jz .Lhandle_tail

.p2align 4
.Lloop_64:
decl %ecx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64

/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %r11d,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8

.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
decl %ecx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1

.Lende:
movq %r10,%rax
ret

.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

.section .altinstructions,"a"
.align 8
.quad memset
.quad memset_c
.byte X86_FEATURE_K8_C
.byte memset_c_end-memset_c
.byte memset_c_end-memset_c
.previous

.section .altinstr_replacement,"ax"
/* rdi destination
* rsi value
* rdx count
*/
memset_c:
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
stosb
movq %r9,%rax
ret
memset_c_end:
.previous
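
Two details of the removed memset deserve a note. The fill byte is broadcast to all eight byte lanes by multiplying with 0x0101010101010101 (the mul %rcx with %rax holding that constant), and a misaligned destination gets one unaligned quadword store followed by rounding the pointer up to the next 8-byte boundary, using overlapping stores instead of a per-byte branch. A C sketch of both pieces (the helper names and the split are mine, not the kernel's):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Every byte of c * 0x0101010101010101 equals c,
   e.g. expand_byte(0xAB) == 0xABABABABABABABAB. */
static uint64_t expand_byte(uint8_t c)
{
    return (uint64_t)c * 0x0101010101010101ULL;
}

/* .Lbad_alignment in sketch form: when dst is misaligned and at least
   8 bytes remain, write one unaligned quadword of the pattern, then
   advance dst to the next 8-byte boundary; the 64/8/1-byte loops
   (same staging as the memcpy above) handle the rest. */
static uint8_t *align_dst(uint8_t *d, size_t *len, uint64_t pat)
{
    size_t mis = (uintptr_t)d & 7;
    if (mis != 0 && *len > 7) {
        memcpy(d, &pat, 8);          /* unaligned store */
        d += 8 - mis;
        *len -= 8 - mis;
    }
    return d;
}
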
