glibc/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
/* memcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"

#ifndef MEMCPY
# define MEMCPY		__memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
# define MEMPCPY	__mempcpy_avx512_no_vzeroupper
# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
#endif
	.section .text.avx512,"ax",@progbits

#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)

ENTRY (MEMPCPY)
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
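
/* Common body.  On entry %rdi = destination, %rsi = source and
   %rdx = length; %rax already holds the return value (the
   destination, or destination + length for mempcpy).  %rcx and
   %r9 are set to one past the end of the source and destination
   so the tails can be addressed with negative offsets.  */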
L(start):
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)
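
/* At most 512 bytes: copy a block forward from the start and a
   block backward from the precomputed end pointers.  The two
   blocks may overlap in the middle, so every length in the
   selected range is covered without a loop.  */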
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret
L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret
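
/* More than 512 bytes: choose a strategy by size.  Copies of at
   least half the shared cache size take the non-temporal-store
   path to avoid polluting the cache.  */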
L(512bytesormore):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
#else
	mov	__x86_shared_cache_size_half(%rip), %r8
#endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
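
/* 512 to 1024 bytes: prefetch both ends, then copy the first 512
   and the last 512 bytes; the two ranges may overlap in the
   middle.  */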
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret
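
/* More than 1024 bytes, copying forward (destination below the
   source).  The last 512 bytes of the source are loaded into
   %zmm8-%zmm15 up front so the main loop can run in whole
   512-byte chunks; the saved tail is stored after the loop.  */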
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret
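
/* More than 1024 bytes, copying backward (destination above the
   source, as needed when the buffers overlap in memmove).  Mirror
   image of the forward case: the first 512 bytes of the source
   are saved in %zmm8-%zmm15 and stored after the loop.  */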
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret
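
/* At least half the shared cache size, copying forward: stream
   256-byte chunks with non-temporal stores, which require a
   64-byte-aligned destination.  The destination is rounded up to
   a 128-byte boundary, the skipped head is kept in %zmm4/%zmm5,
   and the loop leaves at most 256 bytes to be finished through
   L(check).  */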
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	/* Align destination for access with non-temporal stores in the
	   loop.  Save the original destination in %r11 first: %rax
	   cannot serve for the head store below, since it holds
	   destination + length when assembled as mempcpy.  */
	mov	%rdi, %r11
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	/* Store the up-to-128-byte head skipped by the alignment step,
	   using the original destination saved in %r11 (storing at
	   (%rax) would write past the buffer for mempcpy).  */
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)
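
/* At least half the shared cache size, copying backward: align
   the end of the destination down to a 128-byte boundary for the
   non-temporal stores and keep the last 128 bytes of the source
   in %zmm4/%zmm5, stored at the end via %r8 (the original end of
   the destination).  The remaining head is finished through
   L(check).  */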
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

	/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (MEMCPY)
#endif