Skip to content

Commit

Permalink
Unroll x86-64 strlen
Browse files Browse the repository at this point in the history
  • Loading branch information
H.J. Lu authored and Ulrich Drepper committed Aug 27, 2010
1 parent b416a90 commit 623aac7
Show file tree
Hide file tree
Showing 6 changed files with 488 additions and 90 deletions.
11 changes: 11 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
2010-08-26 H.J. Lu <hongjiu.lu@intel.com>

* sysdeps/x86_64/strlen.S: Unroll the loop.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse4 strlen-no-bsf.
* sysdeps/x86_64/multiarch/strlen.S (strlen): Return
__strlen_no_bsf if bit_Slow_BSF is set.
(__strlen_sse42): Removed.
* sysdeps/x86_64/multiarch/strlen-no-bsf.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse4.S: New file.

2010-08-25 Roland McGrath <roland@redhat.com>

* sysdeps/x86_64/multiarch/varshift.S: File removed.
Expand Down
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3
strncase_l-ssse3 strlen-sse4 strlen-no-bsf
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
Expand Down
309 changes: 309 additions & 0 deletions sysdeps/x86_64/multiarch/strlen-no-bsf.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
/* strlen without BSF
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#if defined SHARED && !defined NOT_IN_libc

#include <sysdep.h>

.text
/* __strlen_no_bsf -- strlen variant that never executes BSF.

   In:       %rdi = address of the string.
   Out:      %rax = number of bytes that precede the first zero byte.
   Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0-%xmm3, flags.

   Outline:
     1. Bytes 0..15 are tested one at a time.
     2. Sixteen 16-byte aligned chunks are tested with PCMPEQB/PMOVMSKB.
     3. The pointer is rounded down to a 64-byte boundary and a loop
	tests four 16-byte chunks per iteration.
     4. The position of the lowest set bit of the 16-bit match mask is
	found with a TEST/JNZ ladder instead of BSF.

   Invariant for steps 2 and 3: %xmm0-%xmm3 are all-zero whenever they
   are used as a PCMPEQB destination.  A chunk without a zero byte
   leaves an all-zero result behind, so the registers can be reused
   without clearing them again.

   Every adjustment of the result uses the full 64-bit %rax.  A 32-bit
   destination would clear bits 63:32 of the offset computed at L(exit)
   and return a truncated length for strings of 4 GiB or more.  */
ENTRY (__strlen_no_bsf)
	/* %rax = 0 (the 32-bit XOR clears the whole register).  The tail
	   stubs add the byte index to it.  */
	xor	%eax, %eax
	cmpb	$0, (%rdi)
	jz	L(exit_tail0)
	cmpb	$0, 1(%rdi)
	jz	L(exit_tail1)
	cmpb	$0, 2(%rdi)
	jz	L(exit_tail2)
	cmpb	$0, 3(%rdi)
	jz	L(exit_tail3)
	cmpb	$0, 4(%rdi)
	jz	L(exit_tail4)
	cmpb	$0, 5(%rdi)
	jz	L(exit_tail5)
	cmpb	$0, 6(%rdi)
	jz	L(exit_tail6)
	cmpb	$0, 7(%rdi)
	jz	L(exit_tail7)
	cmpb	$0, 8(%rdi)
	jz	L(exit_tail8)
	cmpb	$0, 9(%rdi)
	jz	L(exit_tail9)
	cmpb	$0, 10(%rdi)
	jz	L(exit_tail10)
	cmpb	$0, 11(%rdi)
	jz	L(exit_tail11)
	cmpb	$0, 12(%rdi)
	jz	L(exit_tail12)
	cmpb	$0, 13(%rdi)
	jz	L(exit_tail13)
	cmpb	$0, 14(%rdi)
	jz	L(exit_tail14)
	cmpb	$0, 15(%rdi)
	jz	L(exit_tail15)

	/* No zero byte in s[0..15].
	   %rax = first 16-byte boundary strictly above s.
	   %rcx = s + 16.  L(exit) is always reached with %rax pointing 16
	   bytes past the matching chunk, so %rax - %rcx there equals
	   (chunk start - s).  */
	pxor	%xmm0, %xmm0
	mov	%rdi, %rcx
	mov	%rdi, %rax
	and	$-16, %rax
	add	$16, %rax
	add	$16, %rcx

	/* Chunk 1.  %edx gets one bit per byte, set where the byte is 0.
	   LEA advances the pointer without touching the flags set by
	   TEST.  */
	pcmpeqb	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunk 2.  */
	pcmpeqb	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunk 3.  */
	pcmpeqb	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunk 4.  */
	pcmpeqb	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunks 5-8: the registers are still zero, see the invariant in
	   the function header.  */
	pcmpeqb	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunks 9-12.  */
	pcmpeqb	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Chunks 13-16.  */
	pcmpeqb	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqb	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Round down to a 64-byte boundary.  This can step back over at
	   most 48 bytes, all of which were tested above and contain no
	   zero byte.  %r8d accumulates the four masks; it is zero on entry
	   and stays zero for as long as the loop repeats.  */
	and	$-0x40, %rax
	xor	%r8d, %r8d
L(aligned_64):
	pcmpeqb	(%rax), %xmm0
	pcmpeqb	16(%rax), %xmm1
	pcmpeqb	32(%rax), %xmm2
	pcmpeqb	48(%rax), %xmm3
	pmovmskb %xmm0, %edx
	pmovmskb %xmm1, %esi
	pmovmskb %xmm2, %edi
	pmovmskb %xmm3, %r9d
	or	%edx, %r8d
	or	%esi, %r8d
	or	%edi, %r8d
	or	%r9d, %r8d
	/* ZF comes from the last OR; LEA leaves it intact.  */
	lea	64(%rax), %rax
	jz	L(aligned_64)

	/* %rax is now 64 past the start of the block.  Pick the first
	   chunk with a match, move its mask into %edx and rewind %rax so
	   that it points 16 bytes past that chunk, as L(exit) expects.  */
	test	%edx, %edx
	jnz	L(aligned_64_exit_16)
	test	%esi, %esi
	jnz	L(aligned_64_exit_32)
	test	%edi, %edi
	jnz	L(aligned_64_exit_48)
L(aligned_64_exit_64):
	mov	%r9d, %edx
	jmp	L(aligned_64_exit)
L(aligned_64_exit_48):
	lea	-16(%rax), %rax
	mov	%edi, %edx
	jmp	L(aligned_64_exit)
L(aligned_64_exit_32):
	lea	-32(%rax), %rax
	mov	%esi, %edx
	jmp	L(aligned_64_exit)
L(aligned_64_exit_16):
	lea	-48(%rax), %rax
L(aligned_64_exit):
L(exit):
	/* %rax = offset of the matching chunk from s (full 64 bits).
	   %edx = non-zero 16-bit mask; its lowest set bit is the index of
	   the terminator inside the chunk.  */
	sub	%rcx, %rax
	test	%dl, %dl
	jz	L(exit_high)
	test	$0x01, %dl
	jnz	L(exit_tail0)

	test	$0x02, %dl
	jnz	L(exit_tail1)

	test	$0x04, %dl
	jnz	L(exit_tail2)

	test	$0x08, %dl
	jnz	L(exit_tail3)

	test	$0x10, %dl
	jnz	L(exit_tail4)

	test	$0x20, %dl
	jnz	L(exit_tail5)

	test	$0x40, %dl
	jnz	L(exit_tail6)
	/* %dl is non-zero and bits 0-6 are clear, so bit 7 is the match.  */
	add	$7, %rax
L(exit_tail0):
	ret

L(exit_high):
	/* The match is in bytes 8..15 of the chunk: account for the low
	   eight bytes, then reuse the tail stubs for the index within the
	   high byte of the mask.  */
	add	$8, %rax
	test	$0x01, %dh
	jnz	L(exit_tail0)

	test	$0x02, %dh
	jnz	L(exit_tail1)

	test	$0x04, %dh
	jnz	L(exit_tail2)

	test	$0x08, %dh
	jnz	L(exit_tail3)

	test	$0x10, %dh
	jnz	L(exit_tail4)

	test	$0x20, %dh
	jnz	L(exit_tail5)

	test	$0x40, %dh
	jnz	L(exit_tail6)
	/* Only bit 15 of the mask can be set here.  */
	add	$7, %rax
	ret

	/* Tail stubs: add the byte index N to %rax and return.  They are
	   reached with %rax = 0 from the byte-wise prologue, or with
	   %rax = chunk offset (plus 8 via L(exit_high)) from L(exit).  */
	.p2align 4
L(exit_tail1):
	add	$1, %rax
	ret

L(exit_tail2):
	add	$2, %rax
	ret

L(exit_tail3):
	add	$3, %rax
	ret

L(exit_tail4):
	add	$4, %rax
	ret

L(exit_tail5):
	add	$5, %rax
	ret

L(exit_tail6):
	add	$6, %rax
	ret

L(exit_tail7):
	add	$7, %rax
	ret

L(exit_tail8):
	add	$8, %rax
	ret

L(exit_tail9):
	add	$9, %rax
	ret

L(exit_tail10):
	add	$10, %rax
	ret

L(exit_tail11):
	add	$11, %rax
	ret

L(exit_tail12):
	add	$12, %rax
	ret

L(exit_tail13):
	add	$13, %rax
	ret

L(exit_tail14):
	add	$14, %rax
	ret

L(exit_tail15):
	add	$15, %rax
	ret
END (__strlen_no_bsf)

#endif
Loading

0 comments on commit 623aac7

Please sign in to comment.