From 0f4840be2528b3e3f2ecea009ab08e753701e9be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20B=C3=ADlka?= Date: Tue, 18 Mar 2014 11:01:38 +0100 Subject: [PATCH] Use strspn/strcspn/strpbrk ifunc in internal calls. To make strtok faster and improve performance in general, we need to make one additional change. The comment: /* It doesn't make sense to send libc-internal strcspn calls through a PLT. The speedup we get from using SSE4.2 instruction is likely eaten away by the indirect call in the PLT. */ does not make sense at all, because nobody bothered to check it. The gap between these implementations is quite big: when the haystack is empty, the sse2 version is around 40 cycles slower because it needs to populate a lookup table, and the difference only increases with size. That is much bigger than the PLT slowdown, which is a few cycles. Even the benchtests show a gap (which may also be reversed by branch misprediction, but my internal benchmark showed it as well): simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 This patch also handles strpbrk, which is implemented by including the x86_64/multiarch/strcspn.S file. * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. * sysdeps/x86_64/multiarch/strcspn.S: Likewise. --- ChangeLog | 5 +++++ sysdeps/x86_64/multiarch/strcspn.S | 7 ------- sysdeps/x86_64/multiarch/strspn.S | 6 ------ 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 85234b3559..f2f2447d22 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2015-05-12 Ondřej Bílka + + * sysdeps/x86_64/multiarch/strcspn.S: Remove plt indirection. + * sysdeps/x86_64/multiarch/strspn.S: Likewise. + 2015-05-12 Roland McGrath * posix/uname-values.h: New file. 
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S index 00e46173ae..95e882c443 100644 --- a/sysdeps/x86_64/multiarch/strcspn.S +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -65,14 +65,7 @@ END(STRCSPN) # undef END # define END(name) \ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 #endif - #endif /* HAVE_SSE4_SUPPORT */ #ifdef USE_AS_STRPBRK diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S index aea8e4c945..b734c1729a 100644 --- a/sysdeps/x86_64/multiarch/strspn.S +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -50,12 +50,6 @@ END(strspn) # undef END # define END(name) \ cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strspn calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strspn; __GI_strspn = __strspn_sse2 #endif #endif /* HAVE_SSE4_SUPPORT */