/* strstr with unaligned loads
   Copyright (C) 2009-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
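
/* Algorithm outline (annotation): broadcast the first two needle bytes
   across %xmm1 and %xmm2, then scan the haystack 16 bytes at a time with
   two overlapping unaligned loads (offsets 0 and 1).  pcmpeqb/pminub turn
   each position where both bytes match in sequence into a set bit of the
   pmovmskb mask; NUL positions are folded into the same mask so the scan
   stops at the end of the haystack.  Each candidate position is then
   verified against the rest of the needle byte by byte.  Empty and
   one-byte needles are handled separately, and a start address near the
   end of a page takes the L(cross_page) path below.  */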

ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax
	testb	%al, %al
	je	L(empty)
	movzbl	1(%rsi), %edx
	testb	%dl, %dl
	je	L(strchr)
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax
	punpcklbw %xmm1, %xmm1
	cmpq	$4031, %rax
	punpcklbw %xmm2, %xmm2
	punpcklwd %xmm1, %xmm1
	punpcklwd %xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1
	pshufd	$0, %xmm2, %xmm2
	ja	L(cross_page)
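	/* Check the first 32 bytes: %xmm3 and %xmm0 collect pair matches,
	   %xmm6/%xmm5 collect NUL bytes, and both are merged into %r8,
	   the low 16 bits covering haystack bytes 0-15 and the next 16
	   bits covering bytes 16-31.  */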
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb %xmm4, %r8d
	pmovmskb %xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(next_32_bytes)
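	/* A set bit in %r8 marks either a candidate pair or a NUL.  Take
	   the candidates in ascending order: a NUL first means the needle
	   cannot occur (return NULL); otherwise compare the needle's
	   remaining bytes starting at offset 2.  */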
L(next_pair_index):
	bsf	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero1)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found1)
	cmpb	2(%rax), %dl
	jne	L(next_pair)
	xorl	%edx, %edx
	jmp	L(pair_loop_start)

	.p2align 4
L(strchr):
	movzbl	%al, %esi
	jmp	__strchr_sse2

	.p2align 4
L(pair_loop):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop)
L(found1):
	ret
L(zero1):
	xorl	%eax, %eax
	ret

	.p2align 4
L(next_pair):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index)
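
	/* Nothing in the first 32 bytes; apply the same pair scan to
	   haystack bytes 32-63 before committing to the aligned main
	   loop.  */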
	.p2align 4
L(next_32_bytes):
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb %xmm4, %eax
	salq	$32, %rax
	pmovmskb %xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8
	je	L(loop_header)
L(next_pair2_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero2)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
L(zero2):
	xorl	%eax, %eax
	ret
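
	/* An empty needle matches at the start of the haystack
	   (C99 7.21.5.7).  */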
L(empty):
	mov	%rdi, %rax
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair2_index)
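
	/* Main loop setup: %r9 records where the scan started and %r11
	   holds a work budget, initially -512, that L(next_pair3) uses to
	   detect inputs with too many false candidates.  %rdi is rounded
	   down so the loop can use aligned 64-byte iterations.  */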
L(loop_header):
	movq	$-512, %r11
	movq	%rdi, %r9
	pxor	%xmm7, %xmm7
	andq	$-64, %rdi

	.p2align 4
L(loop):
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	testl	%eax, %eax
	je	L(loop)
	pminub	(%rdi), %xmm6
	pminub	32(%rdi), %xmm4
	pminub	48(%rdi), %xmm5
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm5
	pmovmskb %xmm6, %edx
	movdqa	16(%rdi), %xmm8
	pcmpeqb	%xmm7, %xmm4
	movdqu	15(%rdi), %xmm0
	pmovmskb %xmm5, %r8d
	movdqa	%xmm8, %xmm3
	pmovmskb %xmm4, %ecx
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	salq	$32, %rcx
	pcmpeqb	%xmm7, %xmm8
	salq	$48, %r8
	pminub	%xmm0, %xmm3
	orq	%rcx, %rdx
	por	%xmm3, %xmm8
	orq	%rdx, %r8
	pmovmskb %xmm8, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(loop)
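
	/* This 64-byte block contains a candidate or a NUL, and the code
	   above rebuilt an exact 64-bit mask in %r8.  Note that here each
	   set bit marks the position of the needle's second byte, which
	   is why L(success3) computes the match address as %rcx - 1.  */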
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx
	cmpb	$0, (%rcx)
	je	L(zero)
	xorl	%eax, %eax
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)
	cmpb	1(%rcx), %dl
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	lea	-1(%rcx), %rax
	ret
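
	/* Account for the bytes just compared: if the accumulated
	   comparison work in %r11 exceeds the distance scanned since %r9
	   by more than the 512-byte slack, the input looks quadratic and
	   we switch to the generic strstr instead.  */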
	.p2align 4
L(next_pair3):
	addq	%rax, %r11
	movq	%rdi, %rax
	subq	%r9, %rax
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index3)
	jmp	L(loop)
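
	/* Too many false candidates: tail-call the generic SSE2 strstr.
	   %rdi and %rsi already hold the remaining haystack and the
	   needle (the movq below is a no-op).  */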
	.p2align 4
L(switch_strstr):
	movq	%rdi, %rdi
	jmp	__strstr_sse2
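
	/* The fast path above reads up to 65 bytes from %rdi, which could
	   cross a page when the start offset within the page exceeds
	   4031.  Instead, read the 64-byte aligned block containing the
	   start and shift the resulting mask right by the misalignment
	   (%cl) so bits before the haystack start are discarded.  */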
	.p2align 4
L(cross_page):
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm0, %xmm8
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7
	por	%xmm3, %xmm8
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb %xmm8, %ecx
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm7, %r8d
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb %xmm6, %r10d
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8
	movl	%edi, %ecx
	pmovmskb %xmm0, %edx
	subl	%eax, %ecx
	salq	$48, %rdx
	orq	%rdx, %r8
	shrq	%cl, %r8
	je	L(loop_header)
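
	/* Verify candidates from the aligned block.  As in the main loop,
	   each mask bit marks the needle's second byte; a bit at the very
	   start of the haystack (%rax == %rdi) would place the first byte
	   before the haystack and is skipped.  */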
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero)
	cmpq	%rax, %rdi
	je	L(next_pair4)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)
	cmpb	1(%rax), %dl
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq	$1, %rax
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index4)
	jmp	L(loop_header)
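
	/* Seemingly unreferenced in this file; 'rep ret' is the two-byte
	   return idiom that avoids a branch-prediction penalty on older
	   AMD processors when a ret is itself a branch target.  */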
	.p2align 4
L(found):
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret
END(__strstr_sse2_unaligned)