Skip to content

Commit

Permalink
---
Browse files Browse the repository at this point in the history
yaml
---
r: 212053
b: refs/heads/master
c: 59daa70
h: refs/heads/master
i:
  212051: 46b6e04
v: v3
  • Loading branch information
Ma Ling authored and H. Peter Anvin committed Aug 23, 2010
1 parent dd1d310 commit d8d00a8
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 60 deletions.
2 changes: 1 addition & 1 deletion [refs]
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
---
refs/heads/master: fdf4289679fd41d76553ce224750e9737cd80eea
refs/heads/master: 59daa706fbec745684702741b9f5373142dd9fdc
6 changes: 2 additions & 4 deletions trunk/arch/x86/lib/memcpy_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n)
"1" (src),
"2" (dest)
:"memory");

} else {

if((src + count) < dest)
return memcpy(dest, src, count);
if((src + n) < dest)
return memcpy(dest, src, n);
else
__asm__ __volatile__(
"std\n\t"
Expand Down
158 changes: 103 additions & 55 deletions trunk/arch/x86/lib/memcpy_64.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,84 +40,132 @@
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
movq %rdi, %rax

/*
* Put the number of full 64-byte blocks into %ecx.
* Tail portion is handled at the end:
* Use 32bit CMP here to avoid long NOP padding.
*/
movq %rdi, %rax
movl %edx, %ecx
shrl $6, %ecx
jz .Lhandle_tail
cmp $0x20, %edx
jb .Lhandle_tail

.p2align 4
.Lloop_64:
/*
* We decrement the loop index here - and the zero-flag is
* checked at the end of the loop (instructions in between do
* not change the zero flag):
* We check whether memory false dependence could occur,
* then jump to corresponding copy mode.
*/
decl %ecx
cmp %dil, %sil
jl .Lcopy_backward
subl $0x20, %edx
.Lcopy_forward_loop:
subq $0x20, %rdx

/*
* Move in blocks of 4x16 bytes:
* Move in blocks of 4x8 bytes:
*/
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r8
movq %r11, 0*8(%rdi)
movq %r8, 1*8(%rdi)

movq 2*8(%rsi), %r9
movq 3*8(%rsi), %r10
movq %r9, 2*8(%rdi)
movq %r10, 3*8(%rdi)

movq 4*8(%rsi), %r11
movq 5*8(%rsi), %r8
movq %r11, 4*8(%rdi)
movq %r8, 5*8(%rdi)

movq 6*8(%rsi), %r9
movq 7*8(%rsi), %r10
movq %r9, 6*8(%rdi)
movq %r10, 7*8(%rdi)

leaq 64(%rsi), %rsi
leaq 64(%rdi), %rdi

jnz .Lloop_64
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq 2*8(%rsi), %r10
movq 3*8(%rsi), %r11
leaq 4*8(%rsi), %rsi

movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, 2*8(%rdi)
movq %r11, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae .Lcopy_forward_loop
addq $0x20, %rdx
jmp .Lhandle_tail

.Lcopy_backward:
/*
* Calculate copy position to tail.
*/
addq %rdx, %rsi
addq %rdx, %rdi
subq $0x20, %rdx
/*
* At most 3 ALU operations in one cycle,
* so append NOPs in the same 16-byte chunk.
*/
.p2align 4
.Lcopy_backward_loop:
subq $0x20, %rdx
movq -1*8(%rsi), %r8
movq -2*8(%rsi), %r9
movq -3*8(%rsi), %r10
movq -4*8(%rsi), %r11
leaq -4*8(%rsi), %rsi
movq %r8, -1*8(%rdi)
movq %r9, -2*8(%rdi)
movq %r10, -3*8(%rdi)
movq %r11, -4*8(%rdi)
leaq -4*8(%rdi), %rdi
jae .Lcopy_backward_loop

/*
* Calculate copy position to head.
*/
addq $0x20, %rdx
subq %rdx, %rsi
subq %rdx, %rdi
.Lhandle_tail:
movl %edx, %ecx
andl $63, %ecx
shrl $3, %ecx
jz .Lhandle_7
cmpq $16, %rdx
jb .Lless_16bytes

/*
* Move data from 16 bytes to 31 bytes.
*/
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq -2*8(%rsi, %rdx), %r10
movq -1*8(%rsi, %rdx), %r11
movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
retq
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi), %r8
movq %r8, (%rdi)
leaq 8(%rdi), %rdi
leaq 8(%rsi), %rsi
jnz .Lloop_8

.Lhandle_7:
movl %edx, %ecx
andl $7, %ecx
jz .Lend
.Lless_16bytes:
cmpq $8, %rdx
jb .Lless_8bytes
/*
* Move data from 8 bytes to 15 bytes.
*/
movq 0*8(%rsi), %r8
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
retq
.p2align 4
.Lless_8bytes:
cmpq $4, %rdx
jb .Lless_3bytes

/*
* Move data from 4 bytes to 7 bytes.
*/
movl (%rsi), %ecx
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
retq
.p2align 4
.Lless_3bytes:
cmpl $0, %edx
je .Lend
/*
* Move data from 1 byte to 3 bytes.
*/
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
decl %edx
jnz .Lloop_1

.Lend:
ret
retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
Expand Down

0 comments on commit d8d00a8

Please sign in to comment.