Skip to content
Permalink
Browse files

Add x86-64 memmove with unaligned load/store and rep movsb

Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes.  When
size <= 8 times of vector register size, there is no check for
address overlap between source and destination.  Since overhead for
overlap check is small when size > 8 times of vector register size,
memcpy is an alias of memmove.

A single file provides 2 implementations of memmove, one with rep movsb
and the other without rep movsb.  They share the same code when size is
between 2 times of vector register size and REP_MOVSB_THRESHOLD which
is 2KB for 16-byte vector register size and scaled up for larger vector
register sizes.

Key features:

1. Use overlapping load and store to avoid branch.
2. For size <= 8 times of vector register size, load all sources into
registers and store them together.
3. If there is no address overlap between source and destination, copy
from both ends with 4 times of vector register size at a time.
4. If address of destination > address of source, backward copy 8 times
of vector register size at a time.
5. Otherwise, forward copy 8 times of vector register size at a time.
6. Use rep movsb only for forward copy.  Avoid slow backward rep movsb
by falling back to backward copy 8 times of vector register size at a
time.
7. Skip when address of destination == address of source.

	[BZ #19776]
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
	memmove-avx512-unaligned-erms.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Test
	__memmove_chk_avx512_unaligned_2,
	__memmove_chk_avx512_unaligned_erms,
	__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
	__memmove_chk_sse2_unaligned_2,
	__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
	__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
	__memmove_avx512_unaligned_erms, __memmove_erms,
	__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
	__memcpy_chk_avx512_unaligned_2,
	__memcpy_chk_avx512_unaligned_erms,
	__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
	__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
	__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
	__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
	__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
	__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
	__mempcpy_chk_avx512_unaligned_erms,
	__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
	__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
	__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
	__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
	__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
	__mempcpy_erms.
	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
	file.
	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
	Likewise.
  • Loading branch information...
H.J. Lu
H.J. Lu committed Mar 31, 2016
1 parent 5cdd198 commit 88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb
@@ -1,3 +1,43 @@
2016-03-31 H.J. Lu <hongjiu.lu@intel.com>

[BZ #19776]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test
__memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms,
__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
__memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2,
__memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms,
__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
__mempcpy_erms.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.

2016-03-31 Stefan Liebler <stli@linux.vnet.ibm.com>

* sysdeps/s390/bits/link.h: (La_s390_vr) New typedef.
@@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
memset-avx512-no-vzeroupper
memset-avx512-no-vzeroupper \
memmove-sse2-unaligned-erms \
memmove-avx-unaligned-erms \
memmove-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
@@ -52,16 +52,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))

@@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))

/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
@@ -267,16 +300,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))

@@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
__memcpy_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
__memcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))

/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -303,16 +369,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))

@@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_unaligned_2)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned_2)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned_2)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))

/* Support sysdeps/x86_64/multiarch/strncmp.S. */
@@ -0,0 +1,9 @@
/* AVX configuration for the shared memmove template: selects 32-byte
   ymm vector registers and then includes the common implementation.  */
/* Vector register size in bytes.  */
#define VEC_SIZE 32
/* Name of vector register number i (token-pasted, e.g. VEC(0) -> ymm0).  */
#define VEC(i) ymm##i
/* Unaligned vector load/store instruction.  */
#define VMOVU vmovdqu
/* Aligned vector load/store instruction.  */
#define VMOVA vmovdqa

/* Appends ".avx" to the section name p.  */
#define SECTION(p) p##.avx
/* Pastes p, "_avx_" and s into one symbol name.  NOTE(review): presumably
   yields names such as __memmove_avx_unaligned_erms -- confirm against the
   uses in the included file.  */
#define MEMMOVE_SYMBOL(p,s) p##_avx_##s

/* The implementation itself lives in the included file and is
   parameterized by the macros defined above.  */
#include "memmove-vec-unaligned-erms.S"
@@ -0,0 +1,11 @@
#ifdef HAVE_AVX512_ASM_SUPPORT
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# define SECTION(p) p##.avx512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

# include "memmove-vec-unaligned-erms.S"
#endif
@@ -0,0 +1,9 @@
/* SSE2 configuration for the shared memmove template: selects 16-byte
   xmm vector registers and then includes the common implementation.  */
/* Vector register size in bytes.  */
#define VEC_SIZE 16
/* Name of vector register number i (token-pasted, e.g. VEC(0) -> xmm0).  */
#define VEC(i) xmm##i
/* Unaligned vector load/store instruction.  */
#define VMOVU movdqu
/* Aligned vector load/store instruction.  */
#define VMOVA movdqa

/* Section name p is used unchanged (no ISA-specific suffix).  */
#define SECTION(p) p
/* Pastes p, "_sse2_" and s into one symbol name.  NOTE(review):
   presumably yields names such as __memmove_sse2_unaligned_erms --
   confirm against the uses in the included file.  */
#define MEMMOVE_SYMBOL(p,s) p##_sse2_##s

/* The implementation itself lives in the included file and is
   parameterized by the macros defined above.  */
#include "memmove-vec-unaligned-erms.S"

0 comments on commit 88b57b8

Please sign in to comment.
You can’t perform that action at this time.