X86-64: Prepare memmove-vec-unaligned-erms.S
Prepare memmove-vec-unaligned-erms.S to make the SSE2 version the
default memcpy, mempcpy and memmove.

	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
	(MEMCPY_SYMBOL): New.
	(MEMPCPY_SYMBOL): Likewise.
	(MEMMOVE_CHK_SYMBOL): Likewise.
	Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
	symbols.  Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
	__mempcpy symbols.  Provide alias for __memcpy_chk in libc.a.
	Provide alias for memcpy in libc.a and ld.so.
H.J. Lu committed Apr 6, 2016
1 parent 4af1bb0 commit a7d1c51
Showing 2 changed files with 95 additions and 54 deletions.
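The point of the new MEMCPY_SYMBOL, MEMPCPY_SYMBOL and MEMMOVE_CHK_SYMBOL hooks is that a non-multiarch build can override them so that this file's unaligned_2 entry points come out under the public names. Below is a minimal, hypothetical wrapper along those lines; it is a sketch only (the exact macro set a real default build needs depends on the rest of the file), not part of this commit:

/* Hypothetical default-build wrapper (illustration, not from this
   commit).  Pin the implementation to SSE2 and rename the entry
   points.  */
#define VEC_SIZE	16		/* SSE2: 16-byte vectors.  */
#define VEC(i)		xmm##i
#define VMOVU		movups		/* unaligned vector load/store */
#define VMOVA		movaps		/* aligned vector load/store */
#define SECTION(p)	p		/* plain .text section */

/* Emit the unaligned_2 entries under their unsuffixed names and
   keep the __*_chk entries as-is.  */
#define MEMMOVE_SYMBOL(p,s)	memmove
#define MEMPCPY_SYMBOL(p,s)	__mempcpy
#define MEMCPY_SYMBOL(p,s)	memcpy
#define MEMMOVE_CHK_SYMBOL(p,s)	p

#include "multiarch/memmove-vec-unaligned-erms.S"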
11 changes: 11 additions & 0 deletions ChangeLog
@@ -1,3 +1,14 @@
2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>

	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
	(MEMCPY_SYMBOL): New.
	(MEMPCPY_SYMBOL): Likewise.
	(MEMMOVE_CHK_SYMBOL): Likewise.
	Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
	symbols.  Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
	__mempcpy symbols.  Provide alias for __memcpy_chk in libc.a.
	Provide alias for memcpy in libc.a and ld.so.

2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>

	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
138 changes: 84 additions & 54 deletions sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,18 +32,27 @@
8 * VEC_SIZE at a time.
8. Otherwise, forward copy 8 * VEC_SIZE at a time. */

#if IS_IN (libc)
#include <sysdep.h>

# include <sysdep.h>
# include "asm-syntax.h"
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

# ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
up REP MOVSB operation, REP MOVSB isn't faster on short data. The
@@ -52,32 +61,36 @@
on processors with Enhanced REP MOVSB. Since larger register size
can move more data with a single load and store, the threshold is
higher with larger register size. */
# ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
# endif
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
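/* Worked example (annotation, not in the source): with the default
   formula above, the REP MOVSB threshold is 2048 bytes for SSE2
   (VEC_SIZE = 16), 4096 bytes for AVX2 (VEC_SIZE = 32) and 8192
   bytes for AVX-512 (VEC_SIZE = 64); only copies larger than the
   threshold take the REP MOVSB path.  */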

# ifndef SECTION
# error SECTION is not defined!
# endif
.section SECTION(.text),"ax",@progbits
#ifndef SECTION
# error SECTION is not defined!
#endif

# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
#endif
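/* Annotation (not in the source): the __*_chk variants receive the
   destination buffer size as a fourth argument in %rcx and branch to
   __chk_fail when it is smaller than the copy length in %rdx;
   otherwise they fall straight through into the unchecked entry
   point that follows.  */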

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
#endif
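/* Annotation (not in the source): mempcpy returns dst + n rather
   than dst, so the entry above precomputes that value in %rax and
   then reuses the common memmove body via L(start).  */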

ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
# endif
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
movq %rdi, %rax
@@ -86,24 +99,29 @@ L(start):
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
ret
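/* Annotation (not in the source): for sizes from VEC_SIZE to
   2 * VEC_SIZE the code above loads both the first and the last
   vector of the buffer before storing either, so the two possibly
   overlapping stores are correct even when source and destination
   overlap, with no further size-dependent branches.  */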
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned_2))

# if VEC_SIZE == 16
# if VEC_SIZE == 16 && defined SHARED
/* Only used to measure performance of REP MOVSB. */
# ifdef SHARED
ENTRY (__mempcpy_erms)
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_movsb)
END (__mempcpy_erms)
# endif

ENTRY (__memmove_erms)
movq %rdi, %rax
@@ -132,23 +150,21 @@ strong_alias (__memmove_erms, __memcpy_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
@@ -192,6 +208,7 @@ L(movsb_more_2x_vec):
/* Force 32-bit displacement to avoid long nop between
instructions. */
ja.d32 L(movsb)
#endif
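/* Annotation (not in the source): the .d32 suffix forces the
   assembler to encode the branch with a 32-bit displacement even for
   a nearby target; the longer encoding pads the instruction stream
   so the .p2align below needs no long multi-byte NOP, as the comment
   above explains.  */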
.p2align 4
L(more_2x_vec):
/* More than 2 * VEC. */
@@ -227,13 +244,19 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
# if VEC_SIZE == 16
#if VEC_SIZE == 16
# if defined USE_MULTIARCH && IS_IN (libc)
jbe L(return)
# else
/* Use 32-bit displacement to avoid long nop between
instructions. */
jbe.d32 L(return)
# endif
#else
/* Use 8-bit displacement to avoid long nop between
instructions. */
jbe L(return_disp8)
# endif
#endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -263,22 +286,25 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(return):
#endif
L(return_disp8):
VZEROUPPER
ret
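/* Annotation (not in the source): VZEROUPPER expands to vzeroupper
   only when VEC_SIZE > 16 (see the default definition near the top
   of the file); clearing the upper YMM/ZMM state on exit avoids
   AVX-to-SSE transition penalties in SSE code that runs later.  */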
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
# endif
# if VEC_SIZE > 16
#endif
#if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
# endif
#endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -290,7 +316,7 @@ L(less_vec):
movb %cl, (%rdi)
1:
ret
# if VEC_SIZE > 32
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
@@ -299,16 +325,16 @@ L(between_32_63):
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
# endif
# if VEC_SIZE > 16
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi,%rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
# endif
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
@@ -331,10 +357,10 @@ L(between_2_3):
movw %si, (%rdi)
ret
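/* Annotation (not in the source): the byte-size cases above use the
   same first-chunk/last-chunk trick with scalar registers, e.g. an
   8-to-15-byte copy is one 8-byte load and store from each end of
   the buffer, with both loads issued before either store.  */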

# if VEC_SIZE > 16
#if VEC_SIZE > 16
/* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
# endif
#endif
L(more_2x_vec_overlap):
/* More than 2 * VEC and there is overlap between destination
and source. */
@@ -454,15 +480,19 @@ L(loop_8x_vec_backward):
jmp L(between_4x_vec_and_8x_vec)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

# ifdef SHARED
#ifdef SHARED
# if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
MEMMOVE_SYMBOL (__memcpy, unaligned_2))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
# endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
# endif

#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
MEMCPY_SYMBOL (__memcpy, unaligned_2))
#endif
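/* Annotation (not in the source): with the default macro
   definitions above, these aliases keep their multiarch-suffixed
   names.  In a build that overrides MEMMOVE_SYMBOL and MEMCPY_SYMBOL
   as in the hypothetical wrapper sketched after the commit header,
   the final strong_alias collapses to strong_alias (memmove, memcpy),
   the "alias for memcpy in libc.a and ld.so" named in the commit
   message; the VEC_SIZE == 16 half of the guard admits the static
   SSE2 build, where SHARED is not defined.  */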
