Commit 3af48cb
Optimize 32bit memset/memcpy with SSE2/SSSE3.
H.J. Lu authored and Ulrich Drepper committed Jan 12, 2010
1 parent 4bfc6ab commit 3af48cb
Showing 36 changed files with 6,336 additions and 15 deletions.
62 changes: 62 additions & 0 deletions ChangeLog
@@ -1,3 +1,65 @@
2010-01-12 H.J. Lu <hongjiu.lu@intel.com>

* sysdeps/i386/i686/bcopy.S: New file.

* sysdeps/i386/i686/cacheinfo.c (__x86_64_data_cache_size): Define.

* sysdeps/i386/i686/memcpy.S (__memcpy_chk): Use ENTRY_CHK
and END_CHK.
* sysdeps/i386/i686/memmove.S (__memmove_chk): Likewise.
* sysdeps/i386/i686/mempcpy.S (__mempcpy_chk): Likewise.
* sysdeps/i386/i686/memset.S (__memset_chk): Likewise.

* sysdeps/i386/i686/memmove.S: Support USE_AS_BCOPY.

* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 memmove-ssse3
memcpy-ssse3-rep mempcpy-ssse3-rep memmove-ssse3-rep
bcopy-ssse3 bcopy-ssse3-rep memset-sse2-rep bzero-sse2-rep.
* sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S: New file.
* sysdeps/i386/i686/multiarch/bcopy-ssse3.S: New file.
* sysdeps/i386/i686/multiarch/bcopy.S: New file.
* sysdeps/i386/i686/multiarch/bzero-sse2-rep.S: New file.
* sysdeps/i386/i686/multiarch/bzero-sse2.S: New file.
* sysdeps/i386/i686/multiarch/bzero.S: New file.
* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: New file.
* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: New file.
* sysdeps/i386/i686/multiarch/memcpy.S: New file.
* sysdeps/i386/i686/multiarch/memcpy_chk.S: New file.
* sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S: New file.
* sysdeps/i386/i686/multiarch/memmove-ssse3.S: New file.
* sysdeps/i386/i686/multiarch/memmove.S: New file.
* sysdeps/i386/i686/multiarch/memmove_chk.S: New file.
* sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S: New file.
* sysdeps/i386/i686/multiarch/mempcpy-ssse3.S: New file.
* sysdeps/i386/i686/multiarch/mempcpy.S: New file.
* sysdeps/i386/i686/multiarch/mempcpy_chk.S: New file.
* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: New file.
* sysdeps/i386/i686/multiarch/memset-sse2.S: New file.
* sysdeps/i386/i686/multiarch/memset.S: New file.
* sysdeps/i386/i686/multiarch/memset_chk.S: New file.

* sysdeps/i386/sysdep.h (ENTRY_CHK): New.
(END_CHK): Likewise.

* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
FEATURE_OFFSET, FEATURE_SIZE and FEATURE_INDEX_1.
* sysdeps/x86_64/multiarch/ifunc-defines.sym: Likewise.

* sysdeps/x86_64/cacheinfo.c (intel_02_cache_info): Add entries
for 0x0e and 0x80.
(__x86_64_data_cache_size): New.
(init_cacheinfo): Set __x86_64_data_cache_size.

* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Turn
on bit_Fast_Rep_String for Intel Core i7.

* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Rep_String): New.
(index_Fast_Rep_String): Likewise.
(FEATURE_INDEX_1): Likewise.
(FEATURE_INDEX_MAX): Likewise.
(cpu_features): Add feature.

2010-01-12 Ulrich Drepper <drepper@redhat.com>

* conform/data/sys/select.h-data: Fix up for XPG7.
3 changes: 3 additions & 0 deletions sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
#define USE_AS_BCOPY
#define memmove bcopy
#include <sysdeps/i386/i686/memmove.S>
1 change: 1 addition & 0 deletions sysdeps/i386/i686/cacheinfo.c
@@ -1,3 +1,4 @@
#define __x86_64_data_cache_size __x86_data_cache_size
#define __x86_64_data_cache_size_half __x86_data_cache_size_half
#define __x86_64_shared_cache_size __x86_shared_cache_size
#define __x86_64_shared_cache_size_half __x86_shared_cache_size_half
4 changes: 2 additions & 2 deletions sysdeps/i386/i686/memcpy.S
@@ -32,11 +32,11 @@

.text
#if defined PIC && !defined NOT_IN_libc
ENTRY (__memcpy_chk)
ENTRY_CHK (__memcpy_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memcpy_chk)
END_CHK (__memcpy_chk)
#endif
ENTRY (BP_SYM (memcpy))
ENTER
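A rough C-level sketch of what the fortified prologue above implements (illustration only; memcpy_chk_sketch is a made-up name, and the real __memcpy_chk is the assembly entry point shown, which tail-jumps to __chk_fail on overflow):

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    /* 12(%esp) holds the requested length and 16(%esp) the known size of
       the destination object; the copy is refused if it would overflow.
       abort() stands in for the jump to __chk_fail.  */
    void *memcpy_chk_sketch (void *dest, const void *src, size_t len, size_t destlen)
    {
      if (destlen < len)
        abort ();              /* real code: jb HIDDEN_JUMPTARGET (__chk_fail) */
      return memcpy (dest, src, len);
    }

The same three-instruction check is reused for __mempcpy_chk, __memmove_chk and __memset_chk below.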
27 changes: 21 additions & 6 deletions sysdeps/i386/i686/memmove.S
@@ -26,18 +26,27 @@

#define PARMS LINKAGE+4 /* one spilled register */
#define RTN PARMS
#define DEST RTN+RTN_SIZE
#define SRC DEST+PTR_SIZE
#define LEN SRC+PTR_SIZE

.text
#if defined PIC && !defined NOT_IN_libc
ENTRY (__memmove_chk)

#ifdef USE_AS_BCOPY
# define SRC RTN+RTN_SIZE
# define DEST SRC+PTR_SIZE
# define LEN DEST+PTR_SIZE
#else
# define DEST RTN+RTN_SIZE
# define SRC DEST+PTR_SIZE
# define LEN SRC+PTR_SIZE

# if defined PIC && !defined NOT_IN_libc
ENTRY_CHK (__memmove_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk)
END_CHK (__memmove_chk)
# endif
#endif

ENTRY (BP_SYM (memmove))
ENTER

@@ -69,8 +78,10 @@ ENTRY (BP_SYM (memmove))
movsl
movl %edx, %esi
cfi_restore (esi)
#ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
RETURN_BOUNDED_POINTER (DEST(%esp))
#endif

popl %edi
cfi_adjust_cfa_offset (-4)
@@ -101,8 +112,10 @@ ENTRY (BP_SYM (memmove))
movsl
movl %edx, %esi
cfi_restore (esi)
#ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
RETURN_BOUNDED_POINTER (DEST(%esp))
#endif

cld
popl %edi
@@ -112,4 +125,6 @@ ENTRY (BP_SYM (memmove))
LEAVE
RET_PTR
END (BP_SYM (memmove))
#ifndef USE_AS_BCOPY
libc_hidden_builtin_def (memmove)
#endif
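The USE_AS_BCOPY paths above exist because bcopy takes its pointers in the opposite order from memmove and returns nothing, which is why the SRC/DEST offsets swap and the return-value load is conditionally compiled out. A minimal C sketch of that relationship (illustration only; bcopy_sketch is a made-up name, and the real bcopy is built from this file via the new sysdeps/i386/i686/bcopy.S wrapper):

    #include <stddef.h>
    #include <string.h>

    /* bcopy (src, dest, n) behaves like memmove (dest, src, n) with the
       pointer arguments swapped and no return value.  */
    void bcopy_sketch (const void *src, void *dest, size_t n)
    {
      memmove (dest, src, n);
    }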
4 changes: 2 additions & 2 deletions sysdeps/i386/i686/mempcpy.S
@@ -32,11 +32,11 @@

.text
#if defined PIC && !defined NOT_IN_libc
ENTRY (__mempcpy_chk)
ENTRY_CHK (__mempcpy_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk)
END_CHK (__mempcpy_chk)
#endif
ENTRY (BP_SYM (__mempcpy))
ENTER
4 changes: 2 additions & 2 deletions sysdeps/i386/i686/memset.S
@@ -40,11 +40,11 @@

.text
#if defined PIC && !defined NOT_IN_libc && !BZERO_P
ENTRY (__memset_chk)
ENTRY_CHK (__memset_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk)
END_CHK (__memset_chk)
#endif
ENTRY (BP_SYM (memset))
ENTER
4 changes: 4 additions & 0 deletions sysdeps/i386/i686/multiarch/Makefile
@@ -4,6 +4,10 @@ gen-as-const-headers += ifunc-defines.sym
endif

ifeq ($(subdir),string)
sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
memset-sse2-rep bzero-sse2-rep
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
4 changes: 4 additions & 0 deletions sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
#define USE_AS_MEMMOVE
#define USE_AS_BCOPY
#define MEMCPY __bcopy_ssse3_rep
#include "memcpy-ssse3-rep.S"
4 changes: 4 additions & 0 deletions sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
#define USE_AS_MEMMOVE
#define USE_AS_BCOPY
#define MEMCPY __bcopy_ssse3
#include "memcpy-ssse3.S"
89 changes: 89 additions & 0 deletions sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,89 @@
/* Multiple versions of bcopy
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#include <sysdep.h>
#include <init-arch.h>

/* Define multiple versions only for the definition in lib. */
#ifndef NOT_IN_libc
# ifdef SHARED
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret

.text
ENTRY(bcopy)
.type bcopy, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __bcopy_ia32@GOTOFF(%ebx), %eax
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __bcopy_ssse3@GOTOFF(%ebx), %eax
testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __bcopy_ssse3_rep@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4)
cfi_restore (ebx)
ret
END(bcopy)
# else
.text
ENTRY(bcopy)
.type bcopy, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features
jne 1f
call __init_cpu_features
1: leal __bcopy_ia32, %eax
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
jz 2f
leal __bcopy_ssse3, %eax
testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
jz 2f
leal __bcopy_ssse3_rep, %eax
2: ret
END(bcopy)
# endif

# undef ENTRY
# define ENTRY(name) \
.type __bcopy_ia32, @function; \
.p2align 4; \
__bcopy_ia32: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32

#endif

#include "../bcopy.S"
3 changes: 3 additions & 0 deletions sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
#define USE_AS_BZERO
#define __memset_sse2_rep __bzero_sse2_rep
#include "memset-sse2-rep.S"
3 changes: 3 additions & 0 deletions sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
#define USE_AS_BZERO
#define __memset_sse2 __bzero_sse2
#include "memset-sse2.S"
97 changes: 97 additions & 0 deletions sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,97 @@
/* Multiple versions of bzero
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#include <sysdep.h>
#include <init-arch.h>

/* Define multiple versions only for the definition in lib. */
#ifndef NOT_IN_libc
# ifdef SHARED
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret

.text
ENTRY(__bzero)
.type __bzero, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __bzero_ia32@GOTOFF(%ebx), %eax
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __bzero_sse2@GOTOFF(%ebx), %eax
testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __bzero_sse2_rep@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4)
cfi_restore (ebx)
ret
END(__bzero)
# else
.text
ENTRY(__bzero)
.type __bzero, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features
jne 1f
call __init_cpu_features
1: leal __bzero_ia32, %eax
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
jz 2f
leal __bzero_sse2, %eax
testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
jz 2f
leal __bzero_sse2_rep, %eax
2: ret
END(__bzero)
# endif

# undef ENTRY
# define ENTRY(name) \
.type __bzero_ia32, @function; \
.p2align 4; \
__bzero_ia32: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size __bzero_ia32, .-__bzero_ia32

# ifdef SHARED
# undef libc_hidden_builtin_def
/* IFUNC doesn't work with the hidden functions in shared library since
they will be called without setting up EBX needed for PLT which is
used by IFUNC. */
# define libc_hidden_builtin_def(name) \
.globl __GI___bzero; __GI___bzero = __bzero_ia32
# endif
#endif

#include "../bzero.S"