From 7eb22e757edecb72754f314c8ffb44350a1dadbc Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 9 Feb 2010 22:34:17 -0800 Subject: [PATCH 01/27] Avoid PLT call to fegetenv on s390 --- ChangeLog | 12 ++++++++++++ include/fenv.h | 1 + malloc/malloc.c | 17 +++++++++++++++-- math/fegetenv.c | 1 + sysdeps/i386/fpu/fegetenv.c | 1 + sysdeps/ia64/fpu/fegetenv.c | 1 + sysdeps/powerpc/fpu/fegetenv.c | 1 + sysdeps/s390/fpu/fegetenv.c | 5 +---- sysdeps/sh/sh4/fpu/fegetenv.c | 1 + sysdeps/sparc/fpu/fegetenv.c | 1 + sysdeps/x86_64/fpu/fegetenv.c | 1 + 11 files changed, 36 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index a68be8002d..9b24c0b4a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2010-02-08 Andreas Schwab + + * include/fenv.h: Add hidden proto for fegetenv. + * math/fegetenv.c: Add hidden alias. + * sysdeps/i386/fpu/fegetenv.c: Likewise. + * sysdeps/ia64/fpu/fegetenv.c: Likewise. + * sysdeps/powerpc/fpu/fegetenv.c: Likewise. + * sysdeps/sh/sh4/fpu/fegetenv.c: Likewise. + * sysdeps/sparc/fpu/fegetenv.c: Likewise. + * sysdeps/x86_64/fpu/fegetenv.c: Likewise + * sysdeps/s390/fpu/fegetenv.c: Likewise. Remove unused headers. + 2010-02-05 H.J. Lu [BZ #11230] diff --git a/include/fenv.h b/include/fenv.h index 3aec7e52bb..254162d45c 100644 --- a/include/fenv.h +++ b/include/fenv.h @@ -13,6 +13,7 @@ extern int __fesetenv (__const fenv_t *__envp); extern int __feupdateenv (__const fenv_t *__envp); libm_hidden_proto (feraiseexcept) +libm_hidden_proto (fegetenv) libm_hidden_proto (fesetenv) libm_hidden_proto (fesetround) libm_hidden_proto (feholdexcept) diff --git a/malloc/malloc.c b/malloc/malloc.c index b43e454f6e..acf1bec500 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1,5 +1,5 @@ /* Malloc implementation for multiple threads without lock contention. - Copyright (C) 1996-2006, 2007, 2008, 2009 Free Software Foundation, Inc. + Copyright (C) 1996-2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Wolfram Gloger and Doug Lea , 2001. 
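A note on this patch's headline change before the malloc hunk below: libm_hidden_proto and libm_hidden_def/libm_hidden_ver give fegetenv a hidden-visibility internal alias, so callers inside libm itself bind to it directly at link time instead of going through the PLT, which is both cheaper and immune to symbol interposition. A minimal sketch of the underlying GCC idiom — the names here are illustrative, not glibc's actual macro expansion:

    /* Library-internal hidden alias (sketch). */
    #include <fenv.h>

    int
    fegetenv_impl (fenv_t *envp)
    {
      (void) envp;
      return 0;                     /* stand-in body */
    }

    /* Internal callers reference __fegetenv_internal; hidden visibility
       guarantees a direct, non-PLT, non-interposable call within the DSO. */
    extern __typeof (fegetenv_impl) __fegetenv_internal
      __attribute__ ((alias ("fegetenv_impl"), visibility ("hidden")));

This is broadly what the hidden-proto/hidden-def machinery expands to, modulo symbol versioning — which is why the ports with versioned __fegetenv use libm_hidden_ver instead of libm_hidden_def.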
@@ -4850,7 +4850,8 @@ _int_free(mstate av, mchunkptr p) free_perturb (chunk2mem(p), size - SIZE_SZ); set_fastchunks(av); - fb = &fastbin (av, fastbin_index(size)); + unsigned int idx = fastbin_index(size); + fb = &fastbin (av, idx); #ifdef ATOMIC_FASTBINS mchunkptr fd; @@ -4864,6 +4865,12 @@ _int_free(mstate av, mchunkptr p) errstr = "double free or corruption (fasttop)"; goto errout; } + if (old != NULL && (chunksize(old) > request2size(MAX_FAST_SIZE) + || fastbin_index(chunksize(old)) != idx)) + { + errstr = "invalid fastbin entry (free)"; + goto errout; + } p->fd = fd = old; } while ((old = catomic_compare_and_exchange_val_rel (fb, p, fd)) != fd); @@ -4875,6 +4882,12 @@ _int_free(mstate av, mchunkptr p) errstr = "double free or corruption (fasttop)"; goto errout; } + if (*fb != NULL && (chunksize(*fb) > request2size(MAX_FAST_SIZE) + || fastbin_index(chunksize(*fb)) != idx)) + { + errstr = "invalid fastbin entry (free)"; + goto errout; + } p->fd = *fb; *fb = p; diff --git a/math/fegetenv.c b/math/fegetenv.c index 4a878cc41b..5b524307db 100644 --- a/math/fegetenv.c +++ b/math/fegetenv.c @@ -32,6 +32,7 @@ __fegetenv (fenv_t *envp) strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2); stub_warning (fegetenv) diff --git a/sysdeps/i386/fpu/fegetenv.c b/sysdeps/i386/fpu/fegetenv.c index fb955cf565..ddb67e5d83 100644 --- a/sysdeps/i386/fpu/fegetenv.c +++ b/sysdeps/i386/fpu/fegetenv.c @@ -40,4 +40,5 @@ strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2); diff --git a/sysdeps/ia64/fpu/fegetenv.c b/sysdeps/ia64/fpu/fegetenv.c index 5446b16494..e240f75e43 100644 --- a/sysdeps/ia64/fpu/fegetenv.c +++ b/sysdeps/ia64/fpu/fegetenv.c @@ -27,3 +27,4 @@ fegetenv (fenv_t *envp) return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/powerpc/fpu/fegetenv.c b/sysdeps/powerpc/fpu/fegetenv.c index 53953454cb..3d21abb529 100644 --- a/sysdeps/powerpc/fpu/fegetenv.c +++ b/sysdeps/powerpc/fpu/fegetenv.c @@ -35,4 +35,5 @@ strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2); diff --git a/sysdeps/s390/fpu/fegetenv.c b/sysdeps/s390/fpu/fegetenv.c index a244f2ca8b..04da54c94c 100644 --- a/sysdeps/s390/fpu/fegetenv.c +++ b/sysdeps/s390/fpu/fegetenv.c @@ -20,10 +20,6 @@ #include #include -#include -#include -#include -#include int fegetenv (fenv_t *envp) @@ -33,3 +29,4 @@ fegetenv (fenv_t *envp) /* Success. 
*/ return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/sh/sh4/fpu/fegetenv.c b/sysdeps/sh/sh4/fpu/fegetenv.c index c07b32af30..683939b52d 100644 --- a/sysdeps/sh/sh4/fpu/fegetenv.c +++ b/sysdeps/sh/sh4/fpu/fegetenv.c @@ -29,3 +29,4 @@ fegetenv (fenv_t *envp) return 0; } +libm_hidden_def (fegetenv) diff --git a/sysdeps/sparc/fpu/fegetenv.c b/sysdeps/sparc/fpu/fegetenv.c index 36486f5973..c606a9cac0 100644 --- a/sysdeps/sparc/fpu/fegetenv.c +++ b/sysdeps/sparc/fpu/fegetenv.c @@ -34,4 +34,5 @@ strong_alias (__fegetenv, __old_fegetenv) compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1); #endif +libm_hidden_ver (__fegetenv, fegetenv) versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2); diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c index fa5a8dadcb..2159a1fab1 100644 --- a/sysdeps/x86_64/fpu/fegetenv.c +++ b/sysdeps/x86_64/fpu/fegetenv.c @@ -28,3 +28,4 @@ fegetenv (fenv_t *envp) /* Success. */ return 0; } +libm_hidden_def (fegetenv) From 0f507b6c95027a9247f7828ee0f4318951a9d455 Mon Sep 17 00:00:00 2001 From: Luis Machado Date: Tue, 9 Feb 2010 22:42:38 -0800 Subject: [PATCH 02/27] power7-optimized classification functions --- ChangeLog | 15 +++ .../powerpc/powerpc32/power7/fpu/s_finite.S | 89 ++++++++++++++++++ .../powerpc/powerpc32/power7/fpu/s_finitef.S | 1 + .../powerpc/powerpc32/power7/fpu/s_isinf.S | 89 ++++++++++++++++++ .../powerpc/powerpc32/power7/fpu/s_isinff.S | 1 + .../powerpc/powerpc32/power7/fpu/s_isnan.S | 92 +++++++++++++++++++ .../powerpc/powerpc32/power7/fpu/s_isnanf.S | 1 + .../powerpc/powerpc64/power7/fpu/s_finite.S | 69 ++++++++++++++ .../powerpc/powerpc64/power7/fpu/s_finitef.S | 1 + .../powerpc/powerpc64/power7/fpu/s_isinf.S | 72 +++++++++++++++ .../powerpc/powerpc64/power7/fpu/s_isinff.S | 1 + .../powerpc/powerpc64/power7/fpu/s_isnan.S | 70 ++++++++++++++ .../powerpc/powerpc64/power7/fpu/s_isnanf.S | 1 + 13 files changed, 502 insertions(+) create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S create mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S diff --git a/ChangeLog b/ChangeLog index 9b24c0b4a1..d9e1c3c4e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2010-02-08 Luis Machado + + * sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S: New file. + * sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S: New file. + * sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S: New file. + * sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S: New file. + * sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S: New file. 
+ * sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S: New file. + * sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S: New file. + 2010-02-08 Andreas Schwab * include/fenv.h: Add hidden proto for fegetenv. diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S new file mode 100644 index 0000000000..5b0d950c74 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S @@ -0,0 +1,89 @@ +/* finite(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __finite(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __finite, @function + .machine power7 +ENTRY (__finite) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,1 + bflr 30 + + /* We have -INF/+INF/NaN or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + + ori 2,2,0 /* Force a new dispatch group. */ + lhz r0,8(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + clrlwi r0,r0,17 /* r0 = abs(r0). */ + addi r1,r1,16 /* Reset the stack pointer. */ + cmpwi cr7,r0,0x7ff0 /* r4 == 0x7ff0?. */ + bltlr cr7 /* LT means we have a denormal. */ + li r3,0 + blr + END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#ifdef NO_LONG_DOUBLE +strong_alias (__finite, __finitel) +weak_alias (__finite, finitel) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. */ diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..1c8a38496c --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S @@ -0,0 +1,89 @@ +/* isinf(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. 
+ Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isinf(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __isinf, @function + .machine power7 +ENTRY (__isinf) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,8(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + addi r1,r1,16 /* Reset the stack pointer. */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif + diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..852539f24b --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S @@ -0,0 +1,92 @@ +/* isnan(). PowerPC32/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isnan(x) */ + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC0: /* 1.0 */ + .quad 0x3ff0000000000000 + + .section ".text" + .type __isnan, @function + .machine power7 +ENTRY (__isnan) +#ifdef SHARED + mflr r11 + cfi_register(lr,r11) + + bcl 20,31,1f +1: mflr r9 + addis r9,r9,.LC0-1b@ha + lfd fp0,.LC0-1b@l(r9) + + mtlr r11 + cfi_same_value (lr) +#else + lis r9,.LC0@ha + lfd fp0,.LC0@l(r9) +#endif + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN or Inf, finish. */ + + /* We have -INF/+INF/NaN or a denormal. */ + + stwu r1,-16(r1) /* Allocate stack space. */ + stfd fp1,8(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lwz r4,8(r1) /* Load the upper half of the FP value. */ + lwz r5,12(r1) /* Load the lower half of the FP value. */ + addi r1,r1,16 /* Reset the stack pointer. */ + lis r0,0x7ff0 /* Load the upper portion for an INF/NaN. */ + clrlwi r4,r4,1 /* r4 = abs(r4). */ + cmpw cr7,r4,r0 /* if (abs(r4) <= inf). */ + cmpwi cr6,r5,0 /* r5 == 0x00000000? */ + bltlr cr7 /* LT means we have a denormal. */ + bgt cr7,L(NaN) /* GT means we have a NaN. */ + beqlr cr6 /* EQ means we have +/-INF. */ +L(NaN): + li r3,1 /* x == NaN? */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S new file mode 100644 index 0000000000..e76e8a5961 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S @@ -0,0 +1,69 @@ +/* finite(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include +#include + +/* int __finite(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __finite, @function + .machine power7 +EALIGN (__finite, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,1 + bflr 30 + + /* If we are here, we either have +/-INF, + NaN or denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + clrlwi r4,r4,17 /* r4 = abs(r4). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + bltlr cr7 /* LT means finite, other non-finite. */ + li r3,0 + blr + END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif + diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..9a72ce7143 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -0,0 +1,72 @@ +/* isinf(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isinf(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isinf, @function + .machine power7 +EALIGN (__isinf, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. 
*/ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif + diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..cd9c75ef6f --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S @@ -0,0 +1,70 @@ +/* isnan(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isnan(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isnan, @function + .machine power7 +EALIGN (__isnan, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN, finish. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + ld r4,-16(r1) /* Load FP into GPR. */ + lis r0,0x7ff0 + sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */ + clrldi r4,r4,1 /* x = fabs(x) */ + cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */ + blelr cr7 /* LE means not NaN. */ + li r3,1 /* else return 1 */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif + diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. */ From 029f8f41c7833d61755649f39f48e57a6d9a0fba Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 9 Feb 2010 22:46:23 -0800 Subject: [PATCH 03/27] Fix whitespace issues. 
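(The whitespace fixes below touch the files added in the previous patch.) A note on the technique those POWER7 classification routines share: ftdiv cr7,fp1,fp0 tests the argument against the constant 1.0 and sets condition-register flags that identify special operands (NaN, infinity, denormal) without raising exceptions, so an ordinary finite argument returns after only a few instructions. Only when a special operand is flagged does the code spill the double to the stack and inspect its IEEE-754 encoding with integer instructions. That slow path is equivalent to the following C sketch — illustrative only, and my_isinf collapses the +1/-1 distinction the assembly preserves for -INF:

    #include <stdint.h>
    #include <string.h>

    /* Absolute value of the raw IEEE-754 double encoding. */
    static uint64_t
    abs_bits (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);            /* safe type pun */
      return u & 0x7fffffffffffffffULL;     /* clear the sign bit */
    }

    /* 0x7ff0...0 is +INF; anything above it has a NaN payload. */
    int my_isnan  (double x) { return abs_bits (x) >  0x7ff0000000000000ULL; }
    int my_isinf  (double x) { return abs_bits (x) == 0x7ff0000000000000ULL; }
    int my_finite (double x) { return abs_bits (x) <  0x7ff0000000000000ULL; }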
--- sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S | 1 - sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S | 1 - sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S | 1 - sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S | 1 - 4 files changed, 4 deletions(-) diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S index 1c8a38496c..2979534911 100644 --- a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S +++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S @@ -86,4 +86,3 @@ compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); compat_symbol (libc, isinf, isinfl, GLIBC_2_0); # endif #endif - diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S index e76e8a5961..6763d1adc8 100644 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S @@ -66,4 +66,3 @@ compat_symbol (libc, __finite, __finitel, GLIBC_2_0); compat_symbol (libc, finite, finitel, GLIBC_2_0); # endif #endif - diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S index 9a72ce7143..f896d38026 100644 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -69,4 +69,3 @@ compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); compat_symbol (libc, isinf, isinfl, GLIBC_2_0); # endif #endif - diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S index cd9c75ef6f..8877012598 100644 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S @@ -67,4 +67,3 @@ compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); compat_symbol (libc, isnan, isnanl, GLIBC_2_0); # endif #endif - From 1d7223f31af8c35053e7a6fbac3070bcaa7bf87d Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 9 Feb 2010 22:54:36 -0800 Subject: [PATCH 04/27] Fix i386 __mpn_lshift unwind info --- ChangeLog | 4 ++++ sysdeps/i386/lshift.S | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index d9e1c3c4e2..98a77abd71 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2010-02-09 Ulrich Drepper + + * sysdeps/i386/lshift.S: Fix unwind information. + 2010-02-08 Luis Machado * sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S: New file. diff --git a/sysdeps/i386/lshift.S b/sysdeps/i386/lshift.S index 536d9878eb..398cf038c7 100644 --- a/sysdeps/i386/lshift.S +++ b/sysdeps/i386/lshift.S @@ -1,5 +1,5 @@ /* i80386 __mpn_lshift -- - Copyright (C) 1992, 1994, 1997-2000, 2005 Free Software Foundation, Inc. + Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -55,6 +55,7 @@ ENTRY (BP_SYM (__mpn_lshift)) movl (%esi,%edx,4),%ebx /* read most significant limb */ cfi_rel_offset (ebx, 0) + cfi_remember_state xorl %eax,%eax shldl %cl,%ebx,%eax /* compute carry limb */ decl %edx @@ -95,6 +96,7 @@ L(1): movl (%esi,%edx,4),%eax LEAVE ret + cfi_restore_state L(end): shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ From 0ee5660b7c592aa41a8e86e801c379864f9489a4 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 9 Feb 2010 22:59:11 -0800 Subject: [PATCH 05/27] More compact unwind info. 
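Both this patch and the previous one touch only unwind annotations. In lshift.S the CFI state described after the early ret was wrong; the fix brackets the code with cfi_remember_state, which snapshots the current call-frame information while the saved-register offsets are still accurate, and cfi_restore_state, which reinstates that snapshot past the ret. rshift.S below gets the same pairing, replacing four re-stated cfi_* directives with a single snapshot/restore and shrinking the unwind tables. For readers unfamiliar with the routine being annotated, here is a portable C sketch of what __mpn_lshift computes (assuming 0 < cnt < limb width; the i386 assembly is an optimized equivalent):

    #include <stddef.h>

    typedef unsigned long mp_limb_t;
    #define LIMB_BITS (8 * sizeof (mp_limb_t))

    /* Shift the n-limb number at up left by cnt bits into rp and return
       the bits shifted out of the most significant limb. */
    mp_limb_t
    mpn_lshift_ref (mp_limb_t *rp, const mp_limb_t *up,
                    size_t n, unsigned int cnt)
    {
      mp_limb_t carry = up[n - 1] >> (LIMB_BITS - cnt);
      for (size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (LIMB_BITS - cnt));
      rp[0] = up[0] << cnt;
      return carry;
    }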
--- ChangeLog | 2 ++ sysdeps/i386/rshift.S | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index 98a77abd71..c8ef5c7723 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2010-02-09 Ulrich Drepper + * sysdeps/i386/rshift.S: More compact unwind information. + * sysdeps/i386/lshift.S: Fix unwind information. 2010-02-08 Luis Machado diff --git a/sysdeps/i386/rshift.S b/sysdeps/i386/rshift.S index 3fd0afe822..332c4d09e7 100644 --- a/sysdeps/i386/rshift.S +++ b/sysdeps/i386/rshift.S @@ -1,5 +1,5 @@ /* i80386 __mpn_rshift -- - Copyright (C) 1992,1994,1997-2000,2005 Free Software Foundation, Inc. + Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify @@ -57,6 +57,7 @@ ENTRY (BP_SYM (__mpn_rshift)) movl (%esi,%edx,4),%ebx /* read least significant limb */ cfi_rel_offset (ebx, 0) + cfi_remember_state xorl %eax,%eax shrdl %cl,%ebx,%eax /* compute carry limb */ incl %edx @@ -97,10 +98,7 @@ L(1): movl (%esi,%edx,4),%eax LEAVE ret - cfi_adjust_cfa_offset (12) - cfi_rel_offset (edi, 8) - cfi_rel_offset (esi, 4) - cfi_rel_offset (ebx, 0) + cfi_restore_state L(end): shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ From 2c2243f7081f66542249cf990d8444105000077b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 10 Feb 2010 00:04:49 -0800 Subject: [PATCH 06/27] Undo unintended malloc change. --- malloc/malloc.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/malloc/malloc.c b/malloc/malloc.c index acf1bec500..b43e454f6e 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1,5 +1,5 @@ /* Malloc implementation for multiple threads without lock contention. - Copyright (C) 1996-2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 1996-2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Wolfram Gloger and Doug Lea , 2001. 
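The hunks below back out the _int_free hardening that slipped into the fegetenv commit (patch 01); a variant of these checks later became a permanent part of malloc.c. For reference, the invariant they enforce — the chunk already at the head of a fastbin must itself look like a fast chunk of the same size class, otherwise the list is corrupt — in a standalone sketch with simplified stand-ins for malloc.c's macros:

    #include <stddef.h>

    /* Simplified stand-ins for malloc.c internals (illustrative only). */
    typedef struct chunk { size_t size; struct chunk *fd; } chunk_t;

    #define SIZE_FLAGS 7UL                 /* low size bits carry flags */
    #define chunksize(p)       ((p)->size & ~SIZE_FLAGS)
    #define fastbin_index(sz)  ((((unsigned int) (sz)) >> 3) - 2)
    #define MAX_FAST_CHUNKSIZE 80          /* assumed bound, not glibc's exact one */

    /* Check applied before linking a freed chunk into fastbin idx. */
    static int
    fastbin_head_ok (const chunk_t *old, unsigned int idx)
    {
      return old == NULL
             || (chunksize (old) <= MAX_FAST_CHUNKSIZE
                 && fastbin_index (chunksize (old)) == idx);
    }

In the ATOMIC_FASTBINS configuration the same test runs inside the lock-free compare-and-swap loop that pushes the chunk onto the bin, so a corrupted list head is caught before it can be reused.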
@@ -4850,8 +4850,7 @@ _int_free(mstate av, mchunkptr p) free_perturb (chunk2mem(p), size - SIZE_SZ); set_fastchunks(av); - unsigned int idx = fastbin_index(size); - fb = &fastbin (av, idx); + fb = &fastbin (av, fastbin_index(size)); #ifdef ATOMIC_FASTBINS mchunkptr fd; @@ -4865,12 +4864,6 @@ _int_free(mstate av, mchunkptr p) errstr = "double free or corruption (fasttop)"; goto errout; } - if (old != NULL && (chunksize(old) > request2size(MAX_FAST_SIZE) - || fastbin_index(chunksize(old)) != idx)) - { - errstr = "invalid fastbin entry (free)"; - goto errout; - } p->fd = fd = old; } while ((old = catomic_compare_and_exchange_val_rel (fb, p, fd)) != fd); @@ -4882,12 +4875,6 @@ _int_free(mstate av, mchunkptr p) errstr = "double free or corruption (fasttop)"; goto errout; } - if (*fb != NULL && (chunksize(*fb) > request2size(MAX_FAST_SIZE) - || fastbin_index(chunksize(*fb)) != idx)) - { - errstr = "invalid fastbin entry (free)"; - goto errout; - } p->fd = *fb; *fb = p; From 61c9346ddce66a34c9823aa8dee7cfcbc8bb9e62 Mon Sep 17 00:00:00 2001 From: Luis Machado Date: Wed, 10 Feb 2010 07:15:01 -0800 Subject: [PATCH 07/27] Fix POWER7 Implies --- ChangeLog | 11 +++++++++++ sysdeps/powerpc/powerpc32/power7/Implies | 1 - sysdeps/powerpc/powerpc32/power7/fpu/Implies | 1 - sysdeps/powerpc/powerpc64/power7/Implies | 1 - sysdeps/powerpc/powerpc64/power7/fpu/Implies | 1 - .../sysv/linux/powerpc/powerpc32/power7/fpu/Implies | 1 + .../sysv/linux/powerpc/powerpc64/power7/fpu/Implies | 1 + 7 files changed, 13 insertions(+), 4 deletions(-) delete mode 100644 sysdeps/powerpc/powerpc32/power7/Implies delete mode 100644 sysdeps/powerpc/powerpc32/power7/fpu/Implies delete mode 100644 sysdeps/powerpc/powerpc64/power7/Implies delete mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/Implies diff --git a/ChangeLog b/ChangeLog index c8ef5c7723..5ab2cbcc17 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2010-02-10 Luis Machado + + * sysdeps/powerpc/powerpc64/power7/Implies: Removed. + * sysdeps/powerpc/powerpc64/power7/fpu/Implies: Removed. + * sysdeps/powerpc/powerpc32/power7/Implies: Removed. + * sysdeps/powerpc/powerpc32/power7/fpu/Implies: Removed. + * sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies: Add + 64-bit power7 fpu path. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies: Add + 32-bit power7 fpu math. + 2010-02-09 Ulrich Drepper * sysdeps/i386/rshift.S: More compact unwind information. 
diff --git a/sysdeps/powerpc/powerpc32/power7/Implies b/sysdeps/powerpc/powerpc32/power7/Implies deleted file mode 100644 index 03899d8a3c..0000000000 --- a/sysdeps/powerpc/powerpc32/power7/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc32/power5 diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/powerpc/powerpc32/power7/fpu/Implies deleted file mode 100644 index 819a7d7979..0000000000 --- a/sysdeps/powerpc/powerpc32/power7/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc32/power5/fpu diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies deleted file mode 100644 index 13b03309fb..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power5 diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies deleted file mode 100644 index 13b03309fb..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power5 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies index d379a2dd12..af946119aa 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies @@ -1,3 +1,4 @@ # Make sure this comes before the powerpc/powerpc32/fpu that's # listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies. +powerpc/powerpc32/power7/fpu powerpc/powerpc32/power5/fpu diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies index c46b3d42af..ca112208d1 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies @@ -1,3 +1,4 @@ # Make sure this comes before the powerpc/powerpc64/fpu that's # listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies. +powerpc/powerpc64/power7/fpu powerpc/powerpc64/power5/fpu From bbabf1f73f611b9101b42c3390a9ddd53e6dd7d4 Mon Sep 17 00:00:00 2001 From: Maxim Kuvyrkov Date: Wed, 10 Feb 2010 07:24:21 -0800 Subject: [PATCH 08/27] Add m68k TLS relocations --- ChangeLog | 4 ++++ elf/elf.h | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 5ab2cbcc17..cea1089875 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2010-02-09 Maxim Kuvyrkov + + * elf/elf.h: Define m68k TLS relocations. + 2010-02-10 Luis Machado * sysdeps/powerpc/powerpc64/power7/Implies: Removed. 
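Background for the elf.h diff below: the m68k TLS ABI defines relocations for the standard access models. The General-Dynamic (GD), Local-Dynamic (LDM), and Initial-Exec (IE) sequences each get GOT offsets in 32-, 16-, and 8-bit widths (m68k addressing can reach the GOT with short displacements), the LDO and LE families cover module-relative and static-TLS-block-relative offsets in the same three widths, and the three dynamic relocations (DTPMOD32, DTPREL32, TPREL32) are what the dynamic linker itself applies at load time. A small C illustration of where the models come from (not m68k-specific):

    /* The compiler picks a TLS access model per reference; each model
       needs its own relocation types, hence the R_68K_TLS_* families. */
    __thread int local_counter;  /* typically Local-Exec/Local-Dynamic */
    __thread int lib_state;      /* if exported from a shared library,
                                    accessed via General-Dynamic or
                                    Initial-Exec sequences instead */

    int
    snapshot (void)
    {
      return local_counter + lib_state;
    }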
diff --git a/elf/elf.h b/elf/elf.h index 8af7c177ce..8b1ee47867 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -1123,8 +1123,29 @@ typedef struct #define R_68K_GLOB_DAT 20 /* Create GOT entry */ #define R_68K_JMP_SLOT 21 /* Create PLT entry */ #define R_68K_RELATIVE 22 /* Adjust by program base */ +#define R_68K_TLS_GD32 25 /* 32 bit GOT offset for GD */ +#define R_68K_TLS_GD16 26 /* 16 bit GOT offset for GD */ +#define R_68K_TLS_GD8 27 /* 8 bit GOT offset for GD */ +#define R_68K_TLS_LDM32 28 /* 32 bit GOT offset for LDM */ +#define R_68K_TLS_LDM16 29 /* 16 bit GOT offset for LDM */ +#define R_68K_TLS_LDM8 30 /* 8 bit GOT offset for LDM */ +#define R_68K_TLS_LDO32 31 /* 32 bit module-relative offset */ +#define R_68K_TLS_LDO16 32 /* 16 bit module-relative offset */ +#define R_68K_TLS_LDO8 33 /* 8 bit module-relative offset */ +#define R_68K_TLS_IE32 34 /* 32 bit GOT offset for IE */ +#define R_68K_TLS_IE16 35 /* 16 bit GOT offset for IE */ +#define R_68K_TLS_IE8 36 /* 8 bit GOT offset for IE */ +#define R_68K_TLS_LE32 37 /* 32 bit offset relative to + static TLS block */ +#define R_68K_TLS_LE16 38 /* 16 bit offset relative to + static TLS block */ +#define R_68K_TLS_LE8 39 /* 8 bit offset relative to + static TLS block */ +#define R_68K_TLS_DTPMOD32 40 /* 32 bit module number */ +#define R_68K_TLS_DTPREL32 41 /* 32 bit module-relative offset */ +#define R_68K_TLS_TPREL32 42 /* 32 bit TP-relative offset */ /* Keep this the last entry. */ -#define R_68K_NUM 23 +#define R_68K_NUM 43 /* Intel 80386 specific definitions. */ From 57e065ae1eda898a42d9cb8e2ea344290d05578c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 10 Feb 2010 07:34:34 -0800 Subject: [PATCH 09/27] Some updates to the README file --- README | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README b/README index 322b4dac4a..7c27e3652a 100644 --- a/README +++ b/README @@ -14,20 +14,23 @@ In GNU/Hurd systems, it works with a microkernel and Hurd servers. The GNU C Library implements much of the POSIX.1 functionality in the GNU/Hurd system, using configurations i[34567]86-*-gnu. -When working with Linux kernels, the GNU C Library version 2.4 is -intended primarily for use with Linux kernel version 2.6.0 and later. -We only support using the NPTL implementation of pthreads, which is now -the default configuration. Most of the C library will continue to work -on older Linux kernels and many programs will not require a 2.6 kernel -to run correctly. However, pthreads and related functionality will not -work at all on old kernels and we do not recommend using glibc 2.4 with -any Linux kernel prior to 2.6. +When working with Linux kernels, the GNU C Library version from +version 2.4 on is intended primarily for use with Linux kernel version +2.6.0 and later. We only support using the NPTL implementation of +pthreads, which is now the default configuration. Most of the C +library will continue to work on older Linux kernels and many programs +will not require a 2.6 kernel to run correctly. However, pthreads and +related functionality will not work at all on old kernels and we do +not recommend using glibc 2.4 with any Linux kernel prior to 2.6. All Linux kernel versions prior to 2.6.16 are known to have some bugs that may cause some of the tests related to pthreads in "make check" to fail. If you see such problems, please try the test suite on the most recent Linux kernel version that you can use, before pursuing those bugs further. 
+Also note that the shared version of the libgcc_s library must be +installed for the pthread library to work correctly. + The old LinuxThreads add-on implementation of pthreads for older Linux kernels is no longer supported, and we are not distributing it with this release. Someone has volunteered to revive its maintenance unofficially @@ -48,7 +51,6 @@ The GNU C Library supports these configurations for using Linux kernels: sparc*-*-linux-gnu sparc64*-*-linux-gnu - alpha*-*-linux-gnu Requires Linux 2.6.9 for NPTL sh[34]-*-linux-gnu Requires Linux 2.6.11 The code for other CPU configurations supported by volunteers outside of @@ -57,6 +59,7 @@ add-on. You can find glibc-ports-VERSION distributed separately in the same place where you got the main glibc distribution files. Currently these configurations are known to work using the `ports' add-on: + alpha*-*-linux-gnu Requires Linux 2.6.9 for NPTL arm-*-linux-gnu Requires Linux 2.6.15 for NPTL, no SMP support arm-*-linux-gnueabi Requires Linux 2.6.16-rc1 for NPTL, no SMP mips-*-linux-gnu Requires Linux 2.6.12 for NPTL From 311927f1bd58b6e2b36e3bbb83bbc4dbdb507155 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 Feb 2010 07:42:17 -0800 Subject: [PATCH 10/27] Sparc STT_GNU_IFUNC support --- ChangeLog | 16 +++ elf/elf.h | 2 + sysdeps/sparc/sparc32/dl-irel.h | 55 +++++++++ sysdeps/sparc/sparc32/dl-machine.h | 81 +++++--------- sysdeps/sparc/sparc32/dl-plt.h | 62 +++++++++++ sysdeps/sparc/sparc64/dl-irel.h | 58 ++++++++++ sysdeps/sparc/sparc64/dl-machine.h | 172 ++++++----------------------- sysdeps/sparc/sparc64/dl-plt.h | 144 ++++++++++++++++++++++++ 8 files changed, 402 insertions(+), 188 deletions(-) create mode 100644 sysdeps/sparc/sparc32/dl-irel.h create mode 100644 sysdeps/sparc/sparc32/dl-plt.h create mode 100644 sysdeps/sparc/sparc64/dl-irel.h create mode 100644 sysdeps/sparc/sparc64/dl-plt.h diff --git a/ChangeLog b/ChangeLog index cea1089875..4e84709413 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2010-02-05 David S. Miller + + * elf/elf.h (R_SPARC_JMP_IREL, R_SPARC_IRELATIVE): Define. + * sysdeps/sparc/sparc32/dl-machine.h (elf_machine_rela): Handle new + ifunc relocs. + (elf_machine_lazy_rel): Likewise. + (sparc_fixup_plt): Pull out to... + * sysdeps/sparc/sparc32/dl-plt.h: ...here. + * sysdeps/sparc/sparc32/dl-irel.h: New file. + * sysdeps/sparc/sparc64/dl-machine.h (elf_machine_rela): Handle new + ifunc relocs. + (elf_machine_lazy_rel): Likewise. + (sparc64_fixup_plt): Pull out to... + * sysdeps/sparc/sparc64/dl-plt.h: ...here. + * sysdeps/sparc/sparc64/dl-irel.h: New file. + 2010-02-09 Maxim Kuvyrkov * elf/elf.h: Define m68k TLS relocations. diff --git a/elf/elf.h b/elf/elf.h index 8b1ee47867..204a0f9e19 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -1324,6 +1324,8 @@ typedef struct #define R_SPARC_H34 85 #define R_SPARC_SIZE32 86 #define R_SPARC_SIZE64 87 +#define R_SPARC_JMP_IREL 248 +#define R_SPARC_IRELATIVE 249 #define R_SPARC_GNU_VTINHERIT 250 #define R_SPARC_GNU_VTENTRY 251 #define R_SPARC_REV32 252 diff --git a/sysdeps/sparc/sparc32/dl-irel.h b/sysdeps/sparc/sparc32/dl-irel.h new file mode 100644 index 0000000000..1891938d6d --- /dev/null +++ b/sysdeps/sparc/sparc32/dl-irel.h @@ -0,0 +1,55 @@ +/* Machine-dependent ELF indirect relocation inline functions. + SPARC 32-bit version. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include +#include +#include + +#define ELF_MACHINE_IRELA 1 + +static inline void +__attribute ((always_inline)) +elf_irela (const Elf32_Rela *reloc) +{ + unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1)) + { + Elf32_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) (); + *reloc_addr = value; + } + else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1)) + { + Elf32_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) (); + + sparc_fixup_plt (reloc, reloc_addr, value, 0, 1); + } + else if (r_type == R_SPARC_NONE) + ; + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h index b3b7852d87..e1385f7aca 100644 --- a/sysdeps/sparc/sparc32/dl-machine.h +++ b/sysdeps/sparc/sparc32/dl-machine.h @@ -1,5 +1,5 @@ /* Machine-dependent ELF dynamic relocation inline functions. SPARC version. - Copyright (C) 1996-2003, 2004, 2005, 2006, 2007 + Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -27,20 +27,13 @@ #include #include #include +#include #ifndef VALIDX # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \ + DT_EXTRANUM + DT_VALTAGIDX (tag)) #endif -/* Some SPARC opcodes we need to use for self-modifying code. */ -#define OPCODE_NOP 0x01000000 /* nop */ -#define OPCODE_CALL 0x40000000 /* call ?; add PC-rel word address */ -#define OPCODE_SETHI_G1 0x03000000 /* sethi ?, %g1; add value>>10 */ -#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */ -#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */ -#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */ - /* Return nonzero iff ELF header is compatible with the running host. */ static inline int elf_machine_matches_host (const Elf32_Ehdr *ehdr) @@ -312,41 +305,6 @@ _dl_start_user:\n\ .size _dl_start_user, . - _dl_start_user\n\ .previous"); -static inline __attribute__ ((always_inline)) Elf32_Addr -sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, - Elf32_Addr value, int t, int do_flush) -{ - Elf32_Sword disp = value - (Elf32_Addr) reloc_addr; - - if (0 && disp >= -0x800000 && disp < 0x800000) - { - /* Don't need to worry about thread safety. We're writing just one - instruction. 
*/ - - reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff); - if (do_flush) - __asm __volatile ("flush %0" : : "r"(reloc_addr)); - } - else - { - /* For thread safety, write the instructions from the bottom and - flush before we overwrite the critical "b,a". This of course - need not be done during bootstrapping, since there are no threads. - But we also can't tell if we _can_ use flush, so don't. */ - - reloc_addr += t; - reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff); - if (do_flush) - __asm __volatile ("flush %0+4" : : "r"(reloc_addr)); - - reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10); - if (do_flush) - __asm __volatile ("flush %0" : : "r"(reloc_addr)); - } - - return value; -} - static inline Elf32_Addr elf_machine_fixup_plt (struct link_map *map, lookup_t t, const Elf32_Rela *reloc, @@ -433,6 +391,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, value += reloc->r_addend; /* Assume copy relocs have zero addend. */ + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)) + { + value = ((Elf32_Addr (*) (void)) value) (); + } + switch (r_type) { #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP @@ -460,6 +425,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, case R_SPARC_32: *reloc_addr = value; break; + case R_SPARC_IRELATIVE: + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + case R_SPARC_JMP_IREL: + value = ((Elf32_Addr (*) (void)) value) (); + /* Fall thru */ case R_SPARC_JMP_SLOT: { #if !defined RTLD_BOOTSTRAP && !defined __sparc_v9__ @@ -578,16 +550,21 @@ __attribute__ ((always_inline)) elf_machine_lazy_rel (struct link_map *map, Elf32_Addr l_addr, const Elf32_Rela *reloc) { - switch (ELF32_R_TYPE (reloc->r_info)) + Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1)) + ; + else if (r_type == R_SPARC_JMP_IREL) { - case R_SPARC_NONE: - break; - case R_SPARC_JMP_SLOT: - break; - default: - _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1); - break; + Elf32_Addr value = map->l_addr + reloc->r_addend; + value = ((Elf32_Addr (*) (void)) value) (); + sparc_fixup_plt (reloc, reloc_addr, value, 0, 1); } + else if (r_type == R_SPARC_NONE) + ; + else + _dl_reloc_bad_type (map, r_type, 1); } #endif /* RESOLVE_MAP */ diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h new file mode 100644 index 0000000000..edcc5c1374 --- /dev/null +++ b/sysdeps/sparc/sparc32/dl-plt.h @@ -0,0 +1,62 @@ +/* PLT fixups. Sparc 32-bit version. + Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* Some SPARC opcodes we need to use for self-modifying code. */ +#define OPCODE_NOP 0x01000000 /* nop */ +#define OPCODE_CALL 0x40000000 /* call ?; add PC-rel word address */ +#define OPCODE_SETHI_G1 0x03000000 /* sethi ?, %g1; add value>>10 */ +#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */ +#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */ +#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */ + +static inline __attribute__ ((always_inline)) Elf32_Addr +sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr, + Elf32_Addr value, int t, int do_flush) +{ + Elf32_Sword disp = value - (Elf32_Addr) reloc_addr; + + if (0 && disp >= -0x800000 && disp < 0x800000) + { + /* Don't need to worry about thread safety. We're writing just one + instruction. */ + + reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff); + if (do_flush) + __asm __volatile ("flush %0" : : "r"(reloc_addr)); + } + else + { + /* For thread safety, write the instructions from the bottom and + flush before we overwrite the critical "b,a". This of course + need not be done during bootstrapping, since there are no threads. + But we also can't tell if we _can_ use flush, so don't. */ + + reloc_addr += t; + reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff); + if (do_flush) + __asm __volatile ("flush %0+4" : : "r"(reloc_addr)); + + reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10); + if (do_flush) + __asm __volatile ("flush %0" : : "r"(reloc_addr)); + } + + return value; +} diff --git a/sysdeps/sparc/sparc64/dl-irel.h b/sysdeps/sparc/sparc64/dl-irel.h new file mode 100644 index 0000000000..1a2a0a3dd5 --- /dev/null +++ b/sysdeps/sparc/sparc64/dl-irel.h @@ -0,0 +1,58 @@ +/* Machine-dependent ELF indirect relocation inline functions. + SPARC 64-bit version. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
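The core mechanism of this whole patch, worth spelling out next to elf_irela: for the new R_SPARC_IRELATIVE (and R_SPARC_JMP_IREL) relocations the addend is not a value but the address of a resolver function; the dynamic linker calls that resolver and writes the address it returns into the GOT or PLT slot. A simplified C sketch of that dispatch — hypothetical implementation names, types reduced to a plain integer:

    #include <stdint.h>

    typedef uintptr_t addr_t;

    static int impl_generic (int x) { return x + 1; }
    static int impl_fast (int x)    { return x + 1; }  /* pretend: cpu-tuned */

    /* An STT_GNU_IFUNC resolver: runs during relocation and returns the
       address of the implementation to use (hardware probing elided). */
    static addr_t
    my_resolver (void)
    {
      int have_fast_cpu = 0;       /* would query hardware capabilities */
      return (addr_t) (have_fast_cpu ? impl_fast : impl_generic);
    }

    /* What elf_irela does for R_SPARC_IRELATIVE: call the resolver whose
       address is stored in r_addend, then patch the slot.
       Usage: addr_t slot; apply_irelative (&slot, (addr_t) my_resolver); */
    static void
    apply_irelative (addr_t *slot, addr_t r_addend)
    {
      *slot = ((addr_t (*) (void)) r_addend) ();
    }

For R_SPARC_JMP_IREL the resolved address additionally has to be burned into PLT instructions, which is why that case goes through sparc64_fixup_plt instead of a plain store.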
*/ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include +#include +#include + +#define ELF_MACHINE_IRELA 1 + +static inline void +__attribute ((always_inline)) +elf_irela (const Elf64_Rela *reloc) +{ + unsigned int r_type = (reloc->r_info & 0xff); + + if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1)) + { + Elf64_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) (); + *reloc_addr = value; + } + else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1)) + { + Elf64_Addr *const reloc_addr = (void *) reloc->r_offset; + Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) (); + struct link_map map = { .l_addr = 0 }; + + /* 'high' is always zero, for large PLT entries the linker + emits an R_SPARC_IRELATIVE. */ + sparc64_fixup_plt (&map, reloc, reloc_addr, value, 0, 0); + } + else if (r_type == R_SPARC_NONE) + ; + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h index 3eee672912..b4f43e9cf5 100644 --- a/sysdeps/sparc/sparc64/dl-machine.h +++ b/sysdeps/sparc/sparc64/dl-machine.h @@ -1,6 +1,6 @@ /* Machine-dependent ELF dynamic relocation inline functions. Sparc64 version. - Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006 - Free Software Foundation, Inc. + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, + 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -27,6 +27,7 @@ #include #include #include +#include #ifndef VALIDX # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \ @@ -89,132 +90,6 @@ elf_machine_load_address (void) return (Elf64_Addr) got - *got + (Elf32_Sword) ((pc[2] - pc[3]) * 4) - 4; } -/* We have 4 cases to handle. And we code different code sequences - for each one. I love V9 code models... */ -static inline void __attribute__ ((always_inline)) -sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, - Elf64_Addr *reloc_addr, Elf64_Addr value, - Elf64_Addr high, int t) -{ - unsigned int *insns = (unsigned int *) reloc_addr; - Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr; - Elf64_Sxword disp = value - plt_vaddr; - - /* Now move plt_vaddr up to the call instruction. */ - plt_vaddr += ((t + 1) * 4); - - /* PLT entries .PLT32768 and above look always the same. */ - if (__builtin_expect (high, 0) != 0) - { - *reloc_addr = value - map->l_addr; - } - /* Near destination. */ - else if (disp >= -0x800000 && disp < 0x800000) - { - /* As this is just one instruction, it is thread safe and so - we can avoid the unnecessary sethi FOO, %g1. - b,a target */ - insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff); - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* 32-bit Sparc style, the target is in the lower 32-bits of - address space. */ - else if (insns += t, (value >> 32) == 0) - { - /* sethi %hi(target), %g1 - jmpl %g1 + %lo(target), %g0 */ - - insns[1] = 0x81c06000 | (value & 0x3ff); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x03000000 | ((unsigned int)(value >> 10)); - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* We can also get somewhat simple sequences if the distance between - the target and the PLT entry is within +/- 2GB. 
*/ - else if ((plt_vaddr > value - && ((plt_vaddr - value) >> 31) == 0) - || (value > plt_vaddr - && ((value - plt_vaddr) >> 31) == 0)) - { - unsigned int displacement; - - if (plt_vaddr > value) - displacement = (0 - (plt_vaddr - value)); - else - displacement = value - plt_vaddr; - - /* mov %o7, %g1 - call displacement - mov %g1, %o7 */ - - insns[2] = 0x9e100001; - __asm __volatile ("flush %0 + 8" : : "r" (insns)); - - insns[1] = 0x40000000 | (displacement >> 2); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x8210000f; - __asm __volatile ("flush %0" : : "r" (insns)); - } - /* Worst case, ho hum... */ - else - { - unsigned int high32 = (value >> 32); - unsigned int low32 = (unsigned int) value; - - /* ??? Some tricks can be stolen from the sparc64 egcs backend - constant formation code I wrote. -DaveM */ - - if (__builtin_expect (high32 & 0x3ff, 0)) - { - /* sethi %hh(value), %g1 - sethi %lm(value), %g5 - or %g1, %hm(value), %g1 - or %g5, %lo(value), %g5 - sllx %g1, 32, %g1 - jmpl %g1 + %g5, %g0 - nop */ - - insns[5] = 0x81c04005; - __asm __volatile ("flush %0 + 20" : : "r" (insns)); - - insns[4] = 0x83287020; - __asm __volatile ("flush %0 + 16" : : "r" (insns)); - - insns[3] = 0x8a116000 | (low32 & 0x3ff); - __asm __volatile ("flush %0 + 12" : : "r" (insns)); - - insns[2] = 0x82106000 | (high32 & 0x3ff); - } - else - { - /* sethi %hh(value), %g1 - sethi %lm(value), %g5 - sllx %g1, 32, %g1 - or %g5, %lo(value), %g5 - jmpl %g1 + %g5, %g0 - nop */ - - insns[4] = 0x81c04005; - __asm __volatile ("flush %0 + 16" : : "r" (insns)); - - insns[3] = 0x8a116000 | (low32 & 0x3ff); - __asm __volatile ("flush %0 + 12" : : "r" (insns)); - - insns[2] = 0x83287020; - } - - __asm __volatile ("flush %0 + 8" : : "r" (insns)); - - insns[1] = 0x0b000000 | (low32 >> 10); - __asm __volatile ("flush %0 + 4" : : "r" (insns)); - - insns[0] = 0x03000000 | (high32 >> 10); - __asm __volatile ("flush %0" : : "r" (insns)); - } -} - static inline Elf64_Addr __attribute__ ((always_inline)) elf_machine_fixup_plt (struct link_map *map, lookup_t t, const Elf64_Rela *reloc, @@ -549,6 +424,11 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc, value += reloc->r_addend; /* Assume copy relocs have zero addend. 
*/ + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)) + value = ((Elf64_Addr (*) (void)) value) (); + switch (r_type) { #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP @@ -576,6 +456,13 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc, case R_SPARC_GLOB_DAT: *reloc_addr = value; break; + case R_SPARC_IRELATIVE: + value = ((Elf64_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + case R_SPARC_JMP_IREL: + value = ((Elf64_Addr (*) (void)) value) (); + /* Fall thru */ case R_SPARC_JMP_SLOT: #ifdef RESOLVE_CONFLICT_FIND_MAP /* R_SPARC_JMP_SLOT conflicts against .plt[32768+] @@ -757,16 +644,29 @@ __attribute__ ((always_inline)) elf_machine_lazy_rel (struct link_map *map, Elf64_Addr l_addr, const Elf64_Rela *reloc) { - switch (ELF64_R_TYPE (reloc->r_info)) + Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + + if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1)) + ; + else if (r_type == R_SPARC_JMP_IREL + || r_type == R_SPARC_IRELATIVE) { - case R_SPARC_NONE: - break; - case R_SPARC_JMP_SLOT: - break; - default: - _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1); - break; + Elf64_Addr value = map->l_addr + reloc->r_addend; + value = ((Elf64_Addr (*) (void)) value) (); + if (r_type == R_SPARC_JMP_IREL) + { + /* 'high' is always zero, for large PLT entries the linker + emits an R_SPARC_IRELATIVE. */ + sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0); + } + else + *reloc_addr = value; } + else if (r_type == R_SPARC_NONE) + ; + else + _dl_reloc_bad_type (map, r_type, 1); } #endif /* RESOLVE_MAP */ diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h new file mode 100644 index 0000000000..e06be43a0a --- /dev/null +++ b/sysdeps/sparc/sparc64/dl-plt.h @@ -0,0 +1,144 @@ +/* PLT fixups. Sparc 64-bit version. + Copyright (C) 1997-2006, 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* We have 4 cases to handle. And we code different code sequences + for each one. I love V9 code models... */ +static inline void __attribute__ ((always_inline)) +sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc, + Elf64_Addr *reloc_addr, Elf64_Addr value, + Elf64_Addr high, int t) +{ + unsigned int *insns = (unsigned int *) reloc_addr; + Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr; + Elf64_Sxword disp = value - plt_vaddr; + + /* Now move plt_vaddr up to the call instruction. */ + plt_vaddr += ((t + 1) * 4); + + /* PLT entries .PLT32768 and above look always the same. */ + if (__builtin_expect (high, 0) != 0) + { + *reloc_addr = value - map->l_addr; + } + /* Near destination. 
*/
+  else if (disp >= -0x800000 && disp < 0x800000)
+    {
+      /* As this is just one instruction, it is thread safe and so
+	 we can avoid the unnecessary sethi FOO, %g1.
+	 b,a target  */
+      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* 32-bit Sparc style, the target is in the lower 32-bits of
+     address space.  */
+  else if (insns += t, (value >> 32) == 0)
+    {
+      /* sethi	%hi(target), %g1
+	 jmpl	%g1 + %lo(target), %g0  */
+
+      insns[1] = 0x81c06000 | (value & 0x3ff);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* We can also get somewhat simple sequences if the distance between
+     the target and the PLT entry is within +/- 2GB.  */
+  else if ((plt_vaddr > value
+	    && ((plt_vaddr - value) >> 31) == 0)
+	   || (value > plt_vaddr
+	       && ((value - plt_vaddr) >> 31) == 0))
+    {
+      unsigned int displacement;
+
+      if (plt_vaddr > value)
+	displacement = (0 - (plt_vaddr - value));
+      else
+	displacement = value - plt_vaddr;
+
+      /* mov	%o7, %g1
+	 call	displacement
+	 mov	%g1, %o7  */
+
+      insns[2] = 0x9e100001;
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x40000000 | (displacement >> 2);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x8210000f;
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* Worst case, ho hum... */
+  else
+    {
+      unsigned int high32 = (value >> 32);
+      unsigned int low32 = (unsigned int) value;
+
+      /* ??? Some tricks can be stolen from the sparc64 egcs backend
+	 constant formation code I wrote.  -DaveM  */
+
+      if (__builtin_expect (high32 & 0x3ff, 0))
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     or		%g1, %hm(value), %g1
+	     or		%g5, %lo(value), %g5
+	     sllx	%g1, 32, %g1
+	     jmpl	%g1 + %g5, %g0
+	     nop  */
+
+	  insns[5] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
+
+	  insns[4] = 0x83287020;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x82106000 | (high32 & 0x3ff);
+	}
+      else
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     sllx	%g1, 32, %g1
+	     or		%g5, %lo(value), %g5
+	     jmpl	%g1 + %g5, %g0
+	     nop  */
+
+	  insns[4] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x83287020;
+	}
+
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x0b000000 | (low32 >> 10);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | (high32 >> 10);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+}
From 247fdf2e2b798378d8aeaac1ee6f4ba0a49f31f3 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 10 Feb 2010 20:31:48 -0800
Subject: [PATCH 11/27] Fix file descriptor leak in nftw with FTW_CHDIR

---
 ChangeLog | 6 ++++++
 io/ftw.c  | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index 4e84709413..be8ba294fb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2010-02-10  Ulrich Drepper
+
+	[BZ #11271]
+	* io/ftw.c (ftw_startup): Close descriptor for initial directory
+	after changing back to it.
+
 2010-02-05  David S. Miller
 
 	* elf/elf.h (R_SPARC_JMP_IREL, R_SPARC_IRELATIVE): Define.
diff --git a/io/ftw.c b/io/ftw.c
index 9cc09077ed..bb7dba8ca8 100644
--- a/io/ftw.c
+++ b/io/ftw.c
@@ -1,5 +1,5 @@
 /* File tree walker functions.
- Copyright (C) 1996-2004, 2006, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 1996-2004, 2006-2008, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -790,6 +790,7 @@ ftw_startup (const char *dir, int is_nftw, void *func, int descriptors, { int save_err = errno; __fchdir (cwdfd); + close_not_cancel_no_status (cwdfd); __set_errno (save_err); } else if (cwd != NULL) From a3dc465852c38ffb4f329ca8b5b477a3c314c1ef Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Thu, 11 Feb 2010 07:22:58 -0800 Subject: [PATCH 12/27] S/390: Disable two UTF conversion instructions --- ChangeLog | 7 +++++++ sysdeps/s390/s390-64/utf16-utf32-z9.c | 11 ++++++++++- sysdeps/s390/s390-64/utf8-utf16-z9.c | 9 ++++++--- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index be8ba294fb..b13a74ca2f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2010-02-11 Andreas Krebbel + + * sysdeps/s390/s390-64/utf8-utf16-z9.c: Disable hardware + instructions cu21 and cu24. Add well-formedness checking + parameter and adjust the software implementation. + * sysdeps/s390/s390-64/utf16-utf32-z9.c: Likewise. + 2010-02-10 Ulrich Drepper [BZ #11271] diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c index 868dea68ca..14daf2118f 100644 --- a/sysdeps/s390/s390-64/utf16-utf32-z9.c +++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -203,7 +203,10 @@ gconv_end (struct __gconv_step *data) swapping). */ #define BODY \ { \ - if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + /* The hardware instruction currently fails to report an error for \ + isolated low surrogates so we have to disable the instruction \ + until this gets resolved. */ \ + if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \ { \ HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ if (inptr != inend) \ @@ -229,6 +232,12 @@ gconv_end (struct __gconv_step *data) } \ else \ { \ + /* An isolated low-surrogate was found. This has to be \ + considered ill-formed. */ \ + if (__builtin_expect (u1 >= 0xdc00, 0)) \ + { \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ /* It's a surrogate character. At least the first word says \ it is. */ \ if (__builtin_expect (inptr + 4 > inend, 0)) \ diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c index 531d3ebd4b..5f73f3c59e 100644 --- a/sysdeps/s390/s390-64/utf8-utf16-z9.c +++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -345,9 +345,12 @@ gconv_end (struct __gconv_step *data) Operation. */ #define BODY \ { \ - if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + /* The hardware instruction currently fails to report an error for \ + isolated low surrogates so we have to disable the instruction \ + until this gets resolved. */ \ + if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \ { \ - HARDWARE_CONVERT ("cu21 %0, %1"); \ + HARDWARE_CONVERT ("cu21 %0, %1, 1"); \ if (inptr != inend) \ { \ /* Check if the third byte is \ @@ -388,7 +391,7 @@ gconv_end (struct __gconv_step *data) \ outptr += 2; \ } \ - else if (c >= 0x0800 && c <= 0xd7ff) \ + else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \ { \ /* Three byte UTF-8 char. 
*/ \ \ From c62cf60c6b37850ee68ea824666eadab99ff718f Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Fri, 12 Feb 2010 05:32:09 -0800 Subject: [PATCH 13/27] fix powerpc macros in tls-macros.h --- ChangeLog | 9 ++ elf/tls-macros.h | 234 +++++++++++++++++++++++------------------------ 2 files changed, 122 insertions(+), 121 deletions(-) diff --git a/ChangeLog b/ChangeLog index b13a74ca2f..919894ec7e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2010-02-12 Alan Modra + + * elf/tls-macros.h (__TLS_CALL_CLOBBERS <__powerpc__>): Remove r3. + Define and use for __powerpc64__ too. + (TLS_LD <__powerpc__>): Add r3 to clobbers. + (TLS_GD <__powerpc__>): Set asm output. Make __result r3 reg. + (TLS_GD <__powerpc64__>): Make __result r3 reg. + (TLS_IE <__powerpc64__>): Relax output constraint. + 2010-02-11 Andreas Krebbel * sysdeps/s390/s390-64/utf8-utf16-z9.c: Disable hardware diff --git a/elf/tls-macros.h b/elf/tls-macros.h index 6463a6c3f9..781256db1e 100644 --- a/elf/tls-macros.h +++ b/elf/tls-macros.h @@ -701,154 +701,146 @@ register void *__gp __asm__("$29"); (int *) (__builtin_thread_pointer() + __offset); }) # endif -#elif defined __powerpc__ && !defined __powerpc64__ +#elif defined __powerpc__ -#include "config.h" - -# define __TLS_CALL_CLOBBERS \ - "0", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", \ +# define __TLS_CALL_CLOBBERS \ + "0", "4", "5", "6", "7", "8", "9", "10", "11", "12", \ "lr", "ctr", "cr0", "cr1", "cr5", "cr6", "cr7" +# ifndef __powerpc64__ + +# include "config.h" + /* PowerPC32 Local Exec TLS access. */ -# define TLS_LE(x) \ - ({ int *__result; \ - asm ("addi %0,2," #x "@tprel" \ - : "=r" (__result)); \ +# define TLS_LE(x) \ + ({ int *__result; \ + asm ("addi %0,2," #x "@tprel" \ + : "=r" (__result)); \ __result; }) /* PowerPC32 Initial Exec TLS access. */ -# ifdef HAVE_ASM_PPC_REL16 -# define TLS_IE(x) \ - ({ int *__result; \ - asm ("bcl 20,31,1f\n1:\t" \ - "mflr %0\n\t" \ - "addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ - "addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ - "lwz %0," #x "@got@tprel(%0)\n\t" \ - "add %0,%0," #x "@tls" \ - : "=b" (__result) : \ - : "lr"); \ +# ifdef HAVE_ASM_PPC_REL16 +# define TLS_IE(x) \ + ({ int *__result; \ + asm ("bcl 20,31,1f\n1:\t" \ + "mflr %0\n\t" \ + "addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ + "addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ + "lwz %0," #x "@got@tprel(%0)\n\t" \ + "add %0,%0," #x "@tls" \ + : "=b" (__result) : \ + : "lr"); \ __result; }) -# else -# define TLS_IE(x) \ - ({ int *__result; \ - asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ - "mflr %0\n\t" \ - "lwz %0," #x "@got@tprel(%0)\n\t" \ - "add %0,%0," #x "@tls" \ - : "=b" (__result) : \ - : "lr"); \ +# else +# define TLS_IE(x) \ + ({ int *__result; \ + asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ + "mflr %0\n\t" \ + "lwz %0," #x "@got@tprel(%0)\n\t" \ + "add %0,%0," #x "@tls" \ + : "=b" (__result) : \ + : "lr"); \ __result; }) -# endif +# endif /* PowerPC32 Local Dynamic TLS access. 
*/ -# ifdef HAVE_ASM_PPC_REL16 -# define TLS_LD(x) \ - ({ int *__result; \ - asm ("bcl 20,31,1f\n1:\t" \ - "mflr 3\n\t" \ - "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ - "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ - "addi 3,3," #x "@got@tlsld\n\t" \ - "bl __tls_get_addr@plt\n\t" \ - "addi %0,3," #x "@dtprel" \ - : "=r" (__result) : \ - : __TLS_CALL_CLOBBERS); \ +# ifdef HAVE_ASM_PPC_REL16 +# define TLS_LD(x) \ + ({ int *__result; \ + asm ("bcl 20,31,1f\n1:\t" \ + "mflr 3\n\t" \ + "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ + "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ + "addi 3,3," #x "@got@tlsld\n\t" \ + "bl __tls_get_addr@plt\n\t" \ + "addi %0,3," #x "@dtprel" \ + : "=r" (__result) : \ + : "3", __TLS_CALL_CLOBBERS); \ __result; }) -# else -# define TLS_LD(x) \ - ({ int *__result; \ - asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ - "mflr 3\n\t" \ - "addi 3,3," #x "@got@tlsld\n\t" \ - "bl __tls_get_addr@plt\n\t" \ - "addi %0,3," #x "@dtprel" \ - : "=r" (__result) : \ - : __TLS_CALL_CLOBBERS); \ +# else +# define TLS_LD(x) \ + ({ int *__result; \ + asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ + "mflr 3\n\t" \ + "addi 3,3," #x "@got@tlsld\n\t" \ + "bl __tls_get_addr@plt\n\t" \ + "addi %0,3," #x "@dtprel" \ + : "=r" (__result) : \ + : "3", __TLS_CALL_CLOBBERS); \ __result; }) -# endif +# endif /* PowerPC32 General Dynamic TLS access. */ -# ifdef HAVE_ASM_PPC_REL16 -# define TLS_GD(x) \ - ({ register int *__result __asm__ ("r3"); \ - asm ("bcl 20,31,1f\n1:\t" \ - "mflr 3\n\t" \ - "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ - "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ - "addi 3,3," #x "@got@tlsgd\n\t" \ - "bl __tls_get_addr@plt" \ - : : \ - : __TLS_CALL_CLOBBERS); \ +# ifdef HAVE_ASM_PPC_REL16 +# define TLS_GD(x) \ + ({ register int *__result __asm__ ("r3"); \ + asm ("bcl 20,31,1f\n1:\t" \ + "mflr 3\n\t" \ + "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t" \ + "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t" \ + "addi 3,3," #x "@got@tlsgd\n\t" \ + "bl __tls_get_addr@plt" \ + : "=r" (__result) : \ + : __TLS_CALL_CLOBBERS); \ __result; }) -# else -# define TLS_GD(x) \ - ({ register int *__result __asm__ ("r3"); \ - asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ - "mflr 3\n\t" \ - "addi 3,3," #x "@got@tlsgd\n\t" \ - "bl __tls_get_addr@plt" \ - : : \ - : __TLS_CALL_CLOBBERS); \ +# else +# define TLS_GD(x) \ + ({ register int *__result __asm__ ("r3"); \ + asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t" \ + "mflr 3\n\t" \ + "addi 3,3," #x "@got@tlsgd\n\t" \ + "bl __tls_get_addr@plt" \ + : "=r" (__result) : \ + : __TLS_CALL_CLOBBERS); \ __result; }) -# endif +# endif -#elif defined __powerpc__ && defined __powerpc64__ +# else /* PowerPC64 Local Exec TLS access. */ -# define TLS_LE(x) \ - ({ int * __result; \ - asm ( \ - " addis %0,13," #x "@tprel@ha\n" \ - " addi %0,%0," #x "@tprel@l\n" \ - : "=b" (__result) ); \ - __result; \ +# define TLS_LE(x) \ + ({ int * __result; \ + asm ("addis %0,13," #x "@tprel@ha\n\t" \ + "addi %0,%0," #x "@tprel@l" \ + : "=b" (__result) ); \ + __result; \ }) /* PowerPC64 Initial Exec TLS access. 
*/ -# define TLS_IE(x) \ - ({ int * __result; \ - asm ( \ - " ld %0," #x "@got@tprel(2)\n" \ - " add %0,%0," #x "@tls\n" \ - : "=b" (__result) ); \ - __result; \ +# define TLS_IE(x) \ + ({ int * __result; \ + asm ("ld %0," #x "@got@tprel(2)\n\t" \ + "add %0,%0," #x "@tls" \ + : "=r" (__result) ); \ + __result; \ }) -# ifdef HAVE_ASM_GLOBAL_DOT_NAME -# define __TLS_GET_ADDR ".__tls_get_addr" -# else -# define __TLS_GET_ADDR "__tls_get_addr" -# endif +# ifdef HAVE_ASM_GLOBAL_DOT_NAME +# define __TLS_GET_ADDR ".__tls_get_addr" +# else +# define __TLS_GET_ADDR "__tls_get_addr" +# endif /* PowerPC64 Local Dynamic TLS access. */ -# define TLS_LD(x) \ - ({ int * __result; \ - asm ( \ - " addi 3,2," #x "@got@tlsld\n" \ - " bl " __TLS_GET_ADDR "\n" \ - " nop \n" \ - " addis %0,3," #x "@dtprel@ha\n" \ - " addi %0,%0," #x "@dtprel@l\n" \ - : "=b" (__result) : \ - : "0", "3", "4", "5", "6", "7", \ - "8", "9", "10", "11", "12", \ - "lr", "ctr", \ - "cr0", "cr1", "cr5", "cr6", "cr7"); \ - __result; \ +# define TLS_LD(x) \ + ({ int * __result; \ + asm ("addi 3,2," #x "@got@tlsld\n\t" \ + "bl " __TLS_GET_ADDR "\n\t" \ + "nop \n\t" \ + "addis %0,3," #x "@dtprel@ha\n\t" \ + "addi %0,%0," #x "@dtprel@l" \ + : "=b" (__result) : \ + : "3", __TLS_CALL_CLOBBERS); \ + __result; \ }) /* PowerPC64 General Dynamic TLS access. */ -# define TLS_GD(x) \ - ({ int * __result; \ - asm ( \ - " addi 3,2," #x "@got@tlsgd\n" \ - " bl " __TLS_GET_ADDR "\n" \ - " nop \n" \ - " mr %0,3\n" \ - : "=b" (__result) : \ - : "0", "3", "4", "5", "6", "7", \ - "8", "9", "10", "11", "12", \ - "lr", "ctr", \ - "cr0", "cr1", "cr5", "cr6", "cr7"); \ - __result; \ +# define TLS_GD(x) \ + ({ register int *__result __asm__ ("r3"); \ + asm ("addi 3,2," #x "@got@tlsgd\n\t" \ + "bl " __TLS_GET_ADDR "\n\t" \ + "nop " \ + : "=r" (__result) : \ + : __TLS_CALL_CLOBBERS); \ + __result; \ }) +# endif #elif !defined TLS_LE || !defined TLS_IE \ || !defined TLS_LD || !defined TLS_GD From 41288fbb784716ee01e4b6d84299a71081431ccb Mon Sep 17 00:00:00 2001 From: Luis Machado Date: Fri, 12 Feb 2010 07:55:01 -0800 Subject: [PATCH 14/27] Cleanup old obsolete PPC_REL16 checks --- ChangeLog | 37 +++++++++++++++++++ sysdeps/powerpc/powerpc32/configure | 9 ++--- sysdeps/powerpc/powerpc32/configure.in | 4 +- sysdeps/powerpc/powerpc32/dl-machine.h | 7 +--- sysdeps/powerpc/powerpc32/dl-start.S | 5 --- sysdeps/powerpc/powerpc32/elf/start.S | 10 ----- .../powerpc/powerpc32/fpu/__longjmp-common.S | 5 --- sysdeps/powerpc/powerpc32/fpu/s_ceil.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_ceilf.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_floor.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_floorf.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_lround.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_rint.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_rintf.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_round.S | 6 --- sysdeps/powerpc/powerpc32/fpu/s_roundf.S | 6 --- sysdeps/powerpc/powerpc32/fpu/s_trunc.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/s_truncf.S | 7 ---- sysdeps/powerpc/powerpc32/fpu/setjmp-common.S | 5 --- sysdeps/powerpc/powerpc32/memset.S | 7 ---- .../powerpc/powerpc32/power4/fpu/s_llround.S | 6 --- sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S | 7 ---- .../powerpc/powerpc32/power4/fpu/w_sqrtf.S | 7 ---- sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S | 7 ---- .../powerpc/powerpc32/power5/fpu/w_sqrtf.S | 7 ---- .../linux/powerpc/powerpc32/____longjmp_chk.S | 7 ---- .../unix/sysv/linux/powerpc/powerpc32/brk.S | 7 ---- .../powerpc/powerpc32/getcontext-common.S | 5 --- 
.../powerpc/powerpc32/setcontext-common.S | 5 --- .../powerpc/powerpc32/swapcontext-common.S | 5 --- 30 files changed, 45 insertions(+), 182 deletions(-) diff --git a/ChangeLog b/ChangeLog index 919894ec7e..765b388441 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +2010-02-12 Luis Machado + + * sysdeps/powerpc/powerpc32/dl-machine.h: Removed old PPC_REL16 + check. + * sysdeps/powerpc/powerpc32/dl-machine.h: Likewise. + * sysdeps/powerpc/powerpc32/elf/start.S: Likewise. + * sysdeps/powerpc/powerpc32/memset.S: Likewise. + * sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S: Likewise. + * sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S: Likewise. + * sysdeps/powerpc/powerpc32/configure.in: Fail if R_PPC_REL16 + is not supported. + * sysdeps/powerpc/powerpc32/fpu/s_round.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_truncf.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_floorf.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_ceilf.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_ceil.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_floor.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_roundf.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_rintf.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_trunc.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/setjmp-common.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_lround.S: Likewise. + * sysdeps/powerpc/powerpc32/fpu/s_rint.S: Likewise. + * sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S: Likewise. + * sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S: Likewise. + * sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S: Likewise. + * sysdeps/powerpc/powerpc32/dl-start.S: Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S: Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S: + Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S: + Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S: + Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S: Likewise. + 2010-02-12 Alan Modra * elf/tls-macros.h (__TLS_CALL_CLOBBERS <__powerpc__>): Remove r3. diff --git a/sysdeps/powerpc/powerpc32/configure b/sysdeps/powerpc/powerpc32/configure index 9b76c57886..da8ec0b87c 100644 --- a/sysdeps/powerpc/powerpc32/configure +++ b/sysdeps/powerpc/powerpc32/configure @@ -25,11 +25,10 @@ rm -f conftest* fi { $as_echo "$as_me:$LINENO: result: $libc_cv_ppc_rel16" >&5 $as_echo "$libc_cv_ppc_rel16" >&6; } -if test $libc_cv_ppc_rel16 = yes; then - cat >>confdefs.h <<\_ACEOF -#define HAVE_ASM_PPC_REL16 1 -_ACEOF - +if test $libc_cv_ppc_rel16 = no; then + { { $as_echo "$as_me:$LINENO: error: R_PPC_REL16 is not supported. Binutils is too old." >&5 +$as_echo "$as_me: error: R_PPC_REL16 is not supported. Binutils is too old." >&2;} + { (exit 1); exit 1; }; } fi # See whether GCC uses -msecure-plt. diff --git a/sysdeps/powerpc/powerpc32/configure.in b/sysdeps/powerpc/powerpc32/configure.in index 7219ad993e..21d3f5ee5b 100644 --- a/sysdeps/powerpc/powerpc32/configure.in +++ b/sysdeps/powerpc/powerpc32/configure.in @@ -13,8 +13,8 @@ else libc_cv_ppc_rel16=no fi rm -f conftest*]) -if test $libc_cv_ppc_rel16 = yes; then - AC_DEFINE(HAVE_ASM_PPC_REL16) +if test $libc_cv_ppc_rel16 = no; then + AC_MSG_ERROR(R_PPC_REL16 is not supported. Binutils is too old.) fi # See whether GCC uses -msecure-plt. 
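Every fallback deleted by this patch computed the GOT address with the pre-REL16
sequence "bl _GLOBAL_OFFSET_TABLE_@local-4; mflr", which bounces through the
blrl instruction that old toolchains placed just before the GOT; the surviving
sequence relies on R_PPC_REL16 relocations against a local label instead.  A
minimal C sketch of the retained idiom, mirroring the ppc_got helper kept in
the dl-machine.h hunk below (PPC32 PIC assumed; got_address is an illustrative
name, not part of the patch):

#include <elf.h>

/* Compute the GOT address PC-relatively, the way the post-cleanup code
   does.  "bcl 20,31,1f" branch-and-links to the very next instruction, a
   form the hardware recognizes as not being a real call, so the
   return-address predictor stays balanced.  mflr then reads the address
   of label 1, and the R_PPC_REL16 @ha/@l pair against
   _GLOBAL_OFFSET_TABLE_-1b turns that PC value into the GOT address.  */
static inline Elf32_Addr *
got_address (void)
{
  Elf32_Addr *got;
  asm ("bcl 20,31,1f\n1:\t"
       "mflr %0\n\t"
       "addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"
       "addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l"
       : "=b" (got) : : "lr");
  return got;
}

Assembling those @ha/@l operands against a local label is precisely what needs
R_PPC_REL16 support in binutils, which is why the configure check above now
fails hard instead of keeping a fallback.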
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.h b/sysdeps/powerpc/powerpc32/dl-machine.h index 6f8d0f506e..5351d9691d 100644 --- a/sysdeps/powerpc/powerpc32/dl-machine.h +++ b/sysdeps/powerpc/powerpc32/dl-machine.h @@ -41,16 +41,13 @@ static inline Elf32_Addr * __attribute__ ((const)) ppc_got (void) { Elf32_Addr *got; -#ifdef HAVE_ASM_PPC_REL16 + asm ("bcl 20,31,1f\n" "1: mflr %0\n" " addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n" " addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n" : "=b" (got) : : "lr"); -#else - asm (" bl _GLOBAL_OFFSET_TABLE_-4@local" - : "=l" (got)); -#endif + return got; } diff --git a/sysdeps/powerpc/powerpc32/dl-start.S b/sysdeps/powerpc/powerpc32/dl-start.S index c77c4de198..ae41f47ede 100644 --- a/sysdeps/powerpc/powerpc32/dl-start.S +++ b/sysdeps/powerpc/powerpc32/dl-start.S @@ -47,15 +47,10 @@ _dl_start_user: passed by value!). */ /* Put our GOT pointer in r31, */ -#ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r31 addis r31,r31,_GLOBAL_OFFSET_TABLE_-1b@ha addi r31,r31,_GLOBAL_OFFSET_TABLE_-1b@l -#else - bl _GLOBAL_OFFSET_TABLE_-4@local - mflr r31 -#endif /* the address of _start in r30, */ mr r30,r3 /* &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28. */ diff --git a/sysdeps/powerpc/powerpc32/elf/start.S b/sysdeps/powerpc/powerpc32/elf/start.S index a8abdca0c6..dc89a5e109 100644 --- a/sysdeps/powerpc/powerpc32/elf/start.S +++ b/sysdeps/powerpc/powerpc32/elf/start.S @@ -53,10 +53,6 @@ L(start_addresses): ASM_SIZE_DIRECTIVE(L(start_addresses)) .section ".text" -#if defined PIC && !defined HAVE_ASM_PPC_REL16 -L(start_addressesp): - .long L(start_addresses)-L(branch) -#endif ENTRY(_start) /* Save the stack pointer, in case we're statically linked under Linux. */ mr r9,r1 @@ -77,16 +73,10 @@ L(branch): start_addresses in r8. Also load the GOT pointer so that new PLT calls work, like the one to __libc_start_main. 
*/ #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 addis r30,r13,_GLOBAL_OFFSET_TABLE_-L(branch)@ha addis r8,r13,L(start_addresses)-L(branch)@ha addi r30,r30,_GLOBAL_OFFSET_TABLE_-L(branch)@l lwzu r13,L(start_addresses)-L(branch)@l(r8) -# else - lwz r8,L(start_addressesp)-L(branch)(r13) - add r8,r13,r8 - lwz r13,0(r8) -# endif #else lis r8,L(start_addresses)@ha lwzu r13,L(start_addresses)@l(r8) diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S index 04ed6da68b..e1ac064a59 100644 --- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S +++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S @@ -34,15 +34,10 @@ ENTRY (BP_SYM (__longjmp)) # ifdef PIC mflr r6 cfi_register (lr,r6) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha addi r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 -# endif # ifdef SHARED lwz r5,_rtld_global_ro@got(r5) mtlr r6 diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S index bc74d302fb..80e72ca2bd 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S @@ -31,17 +31,10 @@ ENTRY (__ceil) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S index 47a75ec0c3..ce6d71e4f8 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S @@ -30,17 +30,10 @@ ENTRY (__ceilf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floor.S b/sysdeps/powerpc/powerpc32/fpu/s_floor.S index a29e4791ea..0dd0dbe6c0 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_floor.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_floor.S @@ -31,17 +31,10 @@ ENTRY (__floor) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S index 99fbdc5f86..98a47458bc 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S @@ -30,17 +30,10 @@ ENTRY (__floorf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S index d73749e134..3bf1ffaea1 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S @@ -45,17 +45,10 @@ ENTRY (__lround) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp10,.LC0-1b@l(r9) -# else - bl 
_GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp10,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rint.S b/sysdeps/powerpc/powerpc32/fpu/s_rint.S index c8dca313ae..93133718ad 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_rint.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_rint.S @@ -33,17 +33,10 @@ ENTRY (__rint) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S index 7771cb2bc8..1e0fbb1f0d 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S @@ -29,17 +29,10 @@ ENTRY (__rintf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_round.S b/sysdeps/powerpc/powerpc32/fpu/s_round.S index 590c87ad8c..48b346e651 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_round.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_round.S @@ -43,16 +43,10 @@ ENTRY (__round) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfs fp13,0(r9) diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S index 7e99bca315..88125aad06 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S @@ -42,16 +42,10 @@ ENTRY (__roundf ) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfs fp13,0(r9) diff --git a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S index 5bc0856b9f..c3c021716a 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S @@ -38,17 +38,10 @@ ENTRY (__trunc) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S index e2e3bd6740..eddef070cd 100644 --- a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S +++ b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S @@ -37,17 +37,10 @@ ENTRY (__truncf) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha lfs fp13,.LC0-1b@l(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) - lfs fp13,0(r9) -# endif mtlr r11 cfi_same_value (lr) #else diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S index b7d1abc00d..131e7a332e 100644 --- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S +++ 
b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S @@ -85,15 +85,10 @@ ENTRY (BP_SYM (__sigsetjmp)) # ifdef PIC mflr r6 cfi_register(lr,r6) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha addi r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 -# endif mtlr r6 cfi_same_value (lr) # ifdef SHARED diff --git a/sysdeps/powerpc/powerpc32/memset.S b/sysdeps/powerpc/powerpc32/memset.S index 454abb2b65..b4ce218e24 100644 --- a/sysdeps/powerpc/powerpc32/memset.S +++ b/sysdeps/powerpc/powerpc32/memset.S @@ -256,17 +256,10 @@ L(checklinesize): beq L(medium) /* Establishes GOT addressability so we can load __cache_line_size from static. This value was set from the aux vector during startup. */ -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr rGOT addis rGOT,rGOT,__cache_line_size-1b@ha lwz rCLS,__cache_line_size-1b@l(rGOT) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr rGOT - lwz rGOT,__cache_line_size@got(rGOT) - lwz rCLS,0(rGOT) -# endif mtlr rTMP #else /* Load __cache_line_size from static. This value was set from the diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S index e10a37977a..b03e041d8a 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S @@ -53,16 +53,10 @@ ENTRY (__llround) #ifdef SHARED mflr r11 cfi_register(lr,r11) -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r9 addis r9,r9,.LC0-1b@ha addi r9,r9,.LC0-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r10 - lwz r9,.LC0@got(10) -# endif mtlr r11 cfi_same_value (lr) lfd fp9,0(r9) diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S index 95a0b3915d..8be3cf1848 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S @@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S index c31555194b..9fa282c162 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S @@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S index 105b5912a1..27a1a0dcbb 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S @@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz 
r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S index 14bc0a2ceb..8914855542 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S @@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0) cfi_offset(lr,20-16) cfi_offset(r30,8-16) #ifdef SHARED -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 .LCF1: mflr r30 @@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0) addi r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l lwz r9,_LIB_VERSION@got(30) lwz r0,0(r9) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r30 - lwz r9,_LIB_VERSION@got(30) - lwz r0,0(r9) -# endif #else lis r9,_LIB_VERSION@ha lwz r0,_LIB_VERSION@l(r9) diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S index 4cb968505d..cfd9864f63 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S @@ -28,19 +28,12 @@ #define __longjmp ____longjmp_chk #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 # define LOAD_ARG \ bcl 20,31,1f; \ 1: mflr r3; \ addis r3,r3,_GLOBAL_OFFSET_TABLE_-1b@ha; \ addi r3,r3,_GLOBAL_OFFSET_TABLE_-1b@l; \ lwz r3,.LC0@got(r3) -# else -# define LOAD_ARG \ - bl _GLOBAL_OFFSET_TABLE_-4@local; \ - mflr r3; \ - lwz r3,.LC0@got(r3) -# endif #else # define LOAD_ARG \ lis r3,.LC0@ha; \ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S index e945834945..4c8c6b433b 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S @@ -36,17 +36,10 @@ ENTRY (BP_SYM (__brk)) DO_CALL(SYS_ify(brk)) lwz r6,8(r1) #ifdef PIC -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r5 addis r5,r5,__curbrk-1b@ha stw r3,__curbrk-1b@l(r5) -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r5 - lwz r5,__curbrk@got(r5) - stw r3,0(r5) -# endif #else lis r4,__curbrk@ha stw r3,__curbrk@l(r4) diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S index 63e1773e22..27285ed4a5 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S @@ -145,15 +145,10 @@ ENTRY(__CONTEXT_FUNC_NAME) # ifdef __CONTEXT_ENABLE_VRS # ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S index 127c9e4581..f304090868 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S @@ -73,15 +73,10 @@ ENTRY(__CONTEXT_FUNC_NAME) #ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S 
b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S index 89b1a61954..62efee2dce 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S @@ -146,15 +146,10 @@ ENTRY(__CONTEXT_FUNC_NAME) # ifdef PIC mflr r8 -# ifdef HAVE_ASM_PPC_REL16 bcl 20,31,1f 1: mflr r7 addis r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha addi r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l -# else - bl _GLOBAL_OFFSET_TABLE_@local-4 - mflr r7 -# endif # ifdef SHARED lwz r7,_rtld_global_ro@got(r7) mtlr r8 From 0ab85ce4298875d0dce8bfd4fe2cecd9cda840e3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 12 Feb 2010 08:04:28 -0800 Subject: [PATCH 15/27] Cleanup ChangeLog. --- ChangeLog | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 765b388441..9c0a82a323 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,6 @@ 2010-02-12 Luis Machado - * sysdeps/powerpc/powerpc32/dl-machine.h: Removed old PPC_REL16 - check. + * sysdeps/powerpc/powerpc32/dl-machine.h: Removed old PPC_REL16 check. * sysdeps/powerpc/powerpc32/dl-machine.h: Likewise. * sysdeps/powerpc/powerpc32/elf/start.S: Likewise. * sysdeps/powerpc/powerpc32/memset.S: Likewise. @@ -26,7 +25,8 @@ * sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S: Likewise. * sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S: Likewise. * sysdeps/powerpc/powerpc32/dl-start.S: Likewise. - * sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S: Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S: + Likewise. * sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S: Likewise. * sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S: @@ -37,12 +37,12 @@ 2010-02-12 Alan Modra - * elf/tls-macros.h (__TLS_CALL_CLOBBERS <__powerpc__>): Remove r3. + * elf/tls-macros.h [__powerpc__] (__TLS_CALL_CLOBBERS): Remove r3. Define and use for __powerpc64__ too. - (TLS_LD <__powerpc__>): Add r3 to clobbers. - (TLS_GD <__powerpc__>): Set asm output. Make __result r3 reg. - (TLS_GD <__powerpc64__>): Make __result r3 reg. - (TLS_IE <__powerpc64__>): Relax output constraint. + [__powerpc__] (TLS_LD): Add r3 to clobbers. + [__powerpc__] (TLS_GD): Set asm output. Make __result r3 reg. + [__powerpc64__] (TLS_GD): Make __result r3 reg. + [__powerpc64__] (TLS_IE): Relax output constraint. 2010-02-11 Andreas Krebbel From 904057bc17fb3e3127a35ebf35fcac8d5bc8269b Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Mon, 15 Feb 2010 11:17:50 -0800 Subject: [PATCH 16/27] 32bit memcmp/strcmp/strncmp optimized for SSSE3/SSS4.2 --- ChangeLog | 16 + sysdeps/i386/i686/multiarch/Makefile | 4 +- sysdeps/i386/i686/multiarch/memcmp-sse4.S | 988 ++++++++ sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 1905 ++++++++++++++++ sysdeps/i386/i686/multiarch/memcmp.S | 88 + sysdeps/i386/i686/multiarch/strcmp-sse4.S | 378 ++++ sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2245 +++++++++++++++++++ sysdeps/i386/i686/multiarch/strcmp.S | 115 + sysdeps/i386/i686/multiarch/strncmp-c.c | 8 + sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5 + sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5 + sysdeps/i386/i686/multiarch/strncmp.S | 3 + 12 files changed, 5759 insertions(+), 1 deletion(-) create mode 100644 sysdeps/i386/i686/multiarch/memcmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/memcmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/memcmp.S create mode 100644 sysdeps/i386/i686/multiarch/strcmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/strcmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/strcmp.S create mode 100644 sysdeps/i386/i686/multiarch/strncmp-c.c create mode 100644 sysdeps/i386/i686/multiarch/strncmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/strncmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/strncmp.S diff --git a/ChangeLog b/ChangeLog index 9c0a82a323..6ef8d4c9dc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2010-02-12 H.J. Lu + + * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add + strcmp-ssse3, strcmp-sse4, strncmp-c, strncmp-ssse3, strncmp-sse4, + memcmp-c, memcmp-ssse3, and memcmp-sse4. + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: New file. + * sysdeps/i386/i686/multiarch/memcmp-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/memcmp.S: New file. + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: New file. + * sysdeps/i386/i686/multiarch/strcmp-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/strcmp.S: New file. + * sysdeps/i386/i686/multiarch/strncmp-c.c: New file. + * sysdeps/i386/i686/multiarch/strncmp-sse4.S: New file. + * sysdeps/i386/i686/multiarch/strncmp-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/strncmp.S: New file. + 2010-02-12 Luis Machado * sysdeps/powerpc/powerpc32/dl-machine.h: Removed old PPC_REL16 check. diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index fbad9ae734..e8847d6fc4 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -7,7 +7,9 @@ ifeq ($(subdir),string) sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ - memset-sse2-rep bzero-sse2-rep + memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ + strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ + memcmp-ssse3 memcmp-sse4 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..06437e484c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -0,0 +1,988 @@ +/* memcmp with SSE4.2 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP	__memcmp_sse4_2
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+#define BLK1	PARMS
+#define BLK2	BLK1+4
+#define LEN	BLK2+4
+#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#ifdef SHARED
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+  /* We first load PC into EBX.  */	\
+  call	__i686.get_pc_thunk.bx;	\
+  /* Get the address of the jump table.  */	\
+  addl	$(TABLE - .), %ebx;	\
+  /* Get the entry and convert the relative offset to the	\
+     absolute address.  */	\
+  addl	(%ebx,INDEX,SCALE), %ebx;	\
+  /* We loaded the jump table and adjusted EDX/ESI.  Go.  */	\
+  jmp	*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.
*/ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + cmp $1, %ecx + jbe L(less1bytes) + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + PUSH (%ebx) + jb L(less8bytes) + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) +L(less8bytes): + + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(0bytesend): + xor %eax, %eax + ret + + ALIGN (4) +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + jmp L(16bytes) + ALIGN (4) + +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor 
%xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + ALIGN (4) +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + + +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + 
movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu -42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + 
pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN + +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne 
L(find_diff) + + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN + + ALIGN (4) +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret + + ALIGN (2) +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), 
L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+
+END (MEMCMP)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..bfcf660729
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1905 @@
+/* memcmp with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP	__memcmp_ssse3
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+#define BLK1	PARMS
+#define BLK2	BLK1+4
+#define LEN	BLK2+4
+#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN	RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%edi); \
+	CFI_PUSH (%esi)
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	PUSH	(%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	ALIGN (4)
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+
+	ALIGN (4)
+L(zero):
+	mov	$0, %eax
+	ret
+
+	ALIGN (4)
+L(48bytesormore):
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub	$0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN (4)
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+
+	ALIGN (4)
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand 
%xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + ALIGN (4) +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz 
L(shr_2_gobble_loop) + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_3): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 
32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + 
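/* ZF still reflects the sbb above: %edx is zero only when the two
+	   16-byte blocks both matched (mask 0xffff) and the length counter
+	   did not borrow; the lea pointer updates do not touch EFLAGS.  */
+	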
jz L(shr_7_gobble_loop) + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + 
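/* A full match leaves %edx == 0xffff (one bit per byte pair), so
+	   the sub below yields nonzero, and L(exit) is taken, only when
+	   these 32 bytes mismatch.  */
+	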
lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi 
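+	/* palignr shifts the concatenated source blocks right by 12 so the
+	   unaligned data from %esi lines up with the aligned loads from
+	   %edi; every memory access in the loop stays 16-byte aligned.  */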
+ pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr 
$15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + ALIGN (4) +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx +L(first16bytes): + add %eax, %esi +L(less16bytes): + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(Byte22): + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + ALIGN (4) +L(Byte31): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) + + ALIGN (4) +L(more24bytes): 
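+	/* Dispatch on the exact remaining length; unlike the SSE4.2
+	   version's computed jump through L(table_64bytes), this SSSE3
+	   code selects the L(Nbytes) tail with compare/branch ladders.  */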
+ cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) + + + + + ALIGN (4) +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(38bytes): + mov -38(%eax), 
%ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + ALIGN (4) +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret + +END (MEMCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S new file mode 100644 index 0000000000..fa7c52a003 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcmp.S @@ -0,0 +1,88 @@ +/* Multiple versions of memcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_sse4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memcmp)
+# else
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__memcmp_ssse3, %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__memcmp_sse4_2, %eax
+2:	ret
+END(memcmp)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which is needed for
+   the PLT that IFUNC relies on.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..977647203f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,378 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1	4
+# define STR2	STR1+4
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1	8
+# define STR2	STR1+4
+# define CNT	STR2+4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	test	%ebp, %ebp
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+	jne	L(less4bytes)
+	movdqu	(%eax), %xmm1
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
+L(check_offset):
+	movl	%edi, %ebx
+	movl	%esi, %ecx
+	andl	$0xfff, %ebx
+	andl	$0xfff, %ecx
+	cmpl	%ebx, %ecx
+	cmovl	%ebx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ebx
+	subl	%ebx, %eax
+	jne	L(ret)
+	testl	%ebx, %ebx
+	je	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	$1, %ebp
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	$16, %edi
+	add	$16, %esi
+	jmp	L(check_offset)
+
+L(end):
+	jnc	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	%ecx, %ebp
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ebx
+	movzbl	(%edi,%ebx), %eax
+	movzbl	(%esi,%ebx), %ecx
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
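+	/* The strncmp build also restores %ebp, which was pushed at entry
+	   to hold the maximum-length argument.  */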
+#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + +#ifdef USE_AS_STRNCMP +L(more16byteseq): + POP (%esi) + POP (%edi) + POP (%ebx) +#endif +L(eq): + xorl %eax, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + .p2align 4 +L(less16bytes): + add $0xfefefeff, %ecx + jnc L(less4bytes) + xor (%edx), %ecx + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(less4bytes) + +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(eq) +#endif + mov 4(%edx), %ecx + cmp 4(%eax), %ecx + jne L(more4bytes) + add $0xfefefeff, %ecx + jnc L(more4bytes) + xor 4(%edx), %ecx + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(more4bytes) + +#ifdef USE_AS_STRNCMP + sub $8, %ebp + jbe L(eq) +#endif + + add $8, %edx + add $8, %eax +L(less4bytes): + + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + je L(eq) +#endif + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + je L(eq) +#endif + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + je L(eq) +#endif + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +L(more4bytes): +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + je L(eq) +#endif + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + je L(eq) +#endif + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + je L(eq) +#endif + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + je L(eq) +#endif + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + +END (STRCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..14caae29a1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -0,0 +1,2245 @@ +/* strcmp with SSSE3 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1	4
+# define STR2	STR1+4
+# define UPDATE_STRNCMP_COUNTER
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1	8
+# define STR2	STR1+4
+# define CNT	STR2+4
+
+# define UPDATE_STRNCMP_COUNTER	\
+	/* calculate the number of bytes left to compare */	\
+	mov	$16, %esi;	\
+	sub	%ecx, %esi;	\
+	cmp	%esi, %ebp;	\
+	jbe	L(more8byteseq);	\
+	sub	%esi, %ebp
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	cmp	$16, %ebp
+	jb	L(less16bytes_sncmp)
+	jmp	L(more16bytes)
+#endif
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	je	L(eq)
+L(more16bytes):
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+	xor	%ebx, %ebx
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	or	$0x20, %ebx
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
+ *       n(0~15)             n(0~15)          15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx), %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	
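/* %cl holds the shared misalignment.  Shift both the expected
+	   all-ones mask in %esi and the match mask in %edi (bits set where
+	   bytes are equal and not NUL) to ignore bytes before the start.  */
+	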
shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + mov %ecx, %edi + jne L(less32bytes) + UPDATE_STRNCMP_COUNTER + mov $0x10, %ebx + mov $0x10, %ecx + pxor %xmm0, %xmm0 + .p2align 4 +L(loop_ashr_0): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $15, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -15(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $1, %ebx + lea 1(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_1): + add $16, %edi + jg L(nibble_ashr_1) + +L(gobble_ashr_1): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_1) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffe, %esi + jnz L(ashr_1_exittail) + +#ifdef USE_AS_STRNCMP + cmp $15, %ebp + jbe L(ashr_1_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_1) + + .p2align 4 +L(ashr_1_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -14(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $2, %ebx + lea 2(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_2): + add $16, %edi + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add 
$16, %edi + jg L(nibble_ashr_2) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffc, %esi + jnz L(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $14, %ebp + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -13(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $3, %ebx + lea 3(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_3): + add $16, %edi + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_3) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff8, %esi + jnz L(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %ebp + jbe L(ashr_3_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -12(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $4, %ebx + lea 4(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_4): + add $16, %edi + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa 
%xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_4) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff0, %esi + jnz L(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %ebp + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -11(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $5, %ebx + lea 5(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_5): + add $16, %edi + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_5) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffe0, %esi + jnz L(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %ebp + jbe L(ashr_5_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 + */ + + .p2align 4 +L(ashr_6): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -10(%ecx), %edi + jnz L(less32bytes) + + 
UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $6, %ebx + lea 6(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_6): + add $16, %edi + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_6) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffc0, %esi + jnz L(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %ebp + jbe L(ashr_6_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 + */ + + .p2align 4 +L(ashr_7): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -9(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $7, %ebx + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_7): + add $16, %edi + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_7) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff80, %esi + jnz L(ashr_7_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %ebp + jbe L(ashr_7_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_8 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 + 
*/ + .p2align 4 +L(ashr_8): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -8(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $8, %ebx + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_8): + add $16, %edi + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_8) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff00, %esi + jnz L(ashr_8_exittail) + +#ifdef USE_AS_STRNCMP + cmp $8, %ebp + jbe L(ashr_8_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -7(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $9, %ebx + lea 9(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_9): + add $16, %edi + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_9) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfe00, %esi + jnz L(ashr_9_exittail) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(ashr_9_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp 
L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -6(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $10, %ebx + lea 10(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_10): + add $16, %edi + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_10) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfc00, %esi + jnz L(ashr_10_exittail) + +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + jbe L(ashr_10_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -5(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $11, %ebx + lea 11(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_11): + add $16, %edi + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_11) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe 
L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf800, %esi + jnz L(ashr_11_exittail) + +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + jbe L(ashr_11_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -4(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $12, %ebx + lea 12(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_12): + add $16, %edi + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_12) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf000, %esi + jnz L(ashr_12_exittail) + +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(ashr_12_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -3(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $13, %ebx + lea 13(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_13): + add $16, %edi + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi 
+ jg L(nibble_ashr_13) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xe000, %esi + jnz L(ashr_13_exittail) + +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + jbe L(ashr_13_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -2(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $14, %ebx + lea 14(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_14): + add $16, %edi + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_14) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xc000, %esi + jnz L(ashr_14_exittail) + +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + jbe L(ashr_14_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 + */ + + .p2align 4 +L(ashr_15): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -1(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + or $15, %ebx + lea 15(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_15): + add $16, %edi + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), 
%xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_15) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 +L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0x8000, %esi + jnz L(ashr_15_exittail) + +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + jbe L(ashr_15_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $15, %xmm0 + psrldq $15, %xmm3 + jmp L(aftertail) + + .p2align 4 +L(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + not %esi +L(exit): + mov %ebx, %edi + and $0x1f, %edi + lea -16(%edi, %ecx), %edi +L(less32bytes): + add %edi, %edx + add %ecx, %eax + test $0x20, %ebx + jz L(ret2) + xchg %eax, %edx + + .p2align 4 +L(ret2): + mov %esi, %ecx + POP (%esi) + POP (%edi) + POP (%ebx) +L(less16bytes): + test %cl, %cl + jz L(2next_8_bytes) + + test $0x01, %cl + jnz L(Byte0) + + test $0x02, %cl + jnz L(Byte1) + + test $0x04, %cl + jnz L(Byte2) + + test $0x08, %cl + jnz L(Byte3) + + test $0x10, %cl + jnz L(Byte4) + + test $0x20, %cl + jnz L(Byte5) + + test $0x40, %cl + jnz L(Byte6) +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(eq) +#endif + + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte0): +#ifdef USE_AS_STRNCMP + cmp $0, %ebp + jbe L(eq) +#endif + movzx (%eax), %ecx + movzx (%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte1): +#ifdef USE_AS_STRNCMP + cmp $1, %ebp + jbe L(eq) +#endif + movzx 1(%eax), %ecx + movzx 1(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte2): +#ifdef USE_AS_STRNCMP + cmp $2, %ebp + jbe L(eq) +#endif + movzx 2(%eax), %ecx + movzx 2(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte3): +#ifdef USE_AS_STRNCMP + cmp $3, %ebp + jbe L(eq) +#endif + movzx 3(%eax), %ecx + movzx 3(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte4): +#ifdef USE_AS_STRNCMP + cmp $4, %ebp + jbe L(eq) +#endif + movzx 4(%eax), %ecx + movzx 4(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + .p2align 4 +L(Byte5): +#ifdef USE_AS_STRNCMP + cmp $5, %ebp + jbe L(eq) +#endif + movzx 5(%eax), %ecx + movzx 5(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(Byte6): +#ifdef USE_AS_STRNCMP + cmp $6, %ebp + jbe L(eq) +#endif + movzx 6(%eax), %ecx + movzx 6(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + + .p2align 4 +L(2next_8_bytes): + add $8, %eax + add $8, %edx +#ifdef USE_AS_STRNCMP + cmp $8, %ebp + lea -8(%ebp), %ebp + jbe L(eq) +#endif + + test $0x01, %ch + jnz L(Byte0) + + test $0x02, %ch + jnz 
L(Byte1) + + test $0x04, %ch + jnz L(Byte2) + + test $0x08, %ch + jnz L(Byte3) + + test $0x10, %ch + jnz L(Byte4) + + test $0x20, %ch + jnz L(Byte5) + + test $0x40, %ch + jnz L(Byte6) + +#ifdef USE_AS_STRNCMP + cmp $7, %ebp + jbe L(eq) +#endif + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + + sub %ecx, %eax +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + ret + +#ifdef USE_AS_STRNCMP +L(more8byteseq): + POP (%esi) + POP (%edi) + POP (%ebx) +#endif + +L(eq): + +#ifdef USE_AS_STRNCMP + POP (%ebp) +#endif + xorl %eax, %eax + ret +#ifdef USE_AS_STRNCMP +L(less16bytes_sncmp): + test %ebp, %ebp + jz L(eq) + + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $1, %ebp + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $2, %ebp + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $3, %ebp + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $4, %ebp + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $5, %ebp + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $6, %ebp + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $7, %ebp + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + + cmp $8, %ebp + je L(eq) + + movzbl 8(%eax), %ecx + cmpb %cl, 8(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $9, %ebp + je L(eq) + + movzbl 9(%eax), %ecx + cmpb %cl, 9(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $10, %ebp + je L(eq) + + movzbl 10(%eax), %ecx + cmpb %cl, 10(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $11, %ebp + je L(eq) + + movzbl 11(%eax), %ecx + cmpb %cl, 11(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + + cmp $12, %ebp + je L(eq) + + movzbl 12(%eax), %ecx + cmpb %cl, 12(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $13, %ebp + je L(eq) + + movzbl 13(%eax), %ecx + cmpb %cl, 13(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $14, %ebp + je L(eq) + + movzbl 14(%eax), %ecx + cmpb %cl, 14(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + cmp $15, %ebp + je L(eq) + + movzbl 15(%eax), %ecx + cmpb %cl, 15(%edx) + jne L(neq) + test %cl, %cl + je L(eq) + + POP (%ebp) + xor %eax, %eax + ret +#endif + +END (STRCMP) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S new file mode 100644 index 0000000000..79a1fdfd43 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcmp.S @@ -0,0 +1,115 @@ +/* Multiple versions of strcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCMP
+# define STRCMP		strcmp
+# define __GI_STRCMP	__GI_strcmp
+# define __STRCMP_IA32	__strcmp_ia32
+# define __STRCMP_SSSE3	__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#else
+# define STRCMP		strncmp
+# define __GI_STRCMP	__GI_strncmp
+# define __STRCMP_IA32	__strncmp_ia32
+# define __STRCMP_SSSE3	__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in static library since we
+   need strncmp before the initialization happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
+# ifdef SHARED
+	.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(STRCMP)
+# else
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSSE3, %eax
+	testl	$bit_SSE4_2, FEATURE_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSE4_2, %eax
+2:	ret
+END(STRCMP)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32 +# endif +#endif + +#ifndef USE_AS_STRNCMP +# include "../strcmp.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp-c.c b/sysdeps/i386/i686/multiarch/strncmp-c.c new file mode 100644 index 0000000000..cc059da494 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +# define STRNCMP __strncmp_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32); +#endif + +#include "string/strncmp.c" diff --git a/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/sysdeps/i386/i686/multiarch/strncmp-sse4.S new file mode 100644 index 0000000000..cf14dfaf6c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-sse4.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_sse4_2 +# include "strcmp-sse4.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..536c8685f2 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_ssse3 +# include "strcmp-ssse3.S" +#endif diff --git a/sysdeps/i386/i686/multiarch/strncmp.S b/sysdeps/i386/i686/multiarch/strncmp.S new file mode 100644 index 0000000000..b6814315fb --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncmp.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCMP +#define STRCMP strncmp +#include "strcmp.S" From 6bb74d9f86e543c418f94a7732e8ee47c9e8225f Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Mon, 15 Feb 2010 13:04:54 -0800 Subject: [PATCH 17/27] Fix up new x86 string functions. --- ChangeLog | 10 +++ sysdeps/i386/i686/multiarch/memcmp-sse4.S | 36 +++++------ sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 75 ++++++++++++++++++++-- sysdeps/i386/i686/multiarch/strcmp-sse4.S | 14 ++++ sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 64 +++++++++++++----- 5 files changed, 161 insertions(+), 38 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6ef8d4c9dc..1595a0a55e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2010-02-15 Ulrich Drepper + + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Fix unwind info. + * sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Likewise. + * sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Likewise. + + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Don't fall through to + undefined code. + 2010-02-12 H.J. 
Lu * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S index 06437e484c..71c4e1c337 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -105,43 +105,43 @@ L(less8bytes): mov 1(%eax), %bl cmpb 1(%edx), %bl jne L(nonzero) - - cmp $2, %ecx + + cmp $2, %ecx jz L(0bytes) mov 2(%eax), %bl cmpb 2(%edx), %bl jne L(nonzero) - - cmp $3, %ecx + + cmp $3, %ecx jz L(0bytes) - + mov 3(%eax), %bl cmpb 3(%edx), %bl jne L(nonzero) - - cmp $4, %ecx + + cmp $4, %ecx jz L(0bytes) - + mov 4(%eax), %bl cmpb 4(%edx), %bl jne L(nonzero) - cmp $5, %ecx + cmp $5, %ecx jz L(0bytes) - + mov 5(%eax), %bl cmpb 5(%edx), %bl jne L(nonzero) - cmp $6, %ecx + cmp $6, %ecx jz L(0bytes) - + mov 6(%eax), %bl cmpb 6(%edx), %bl je L(0bytes) L(nonzero): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(above) neg %eax @@ -151,11 +151,11 @@ L(above): ALIGN (4) L(0bytes): - POP (%ebx) + POP (%ebx) xor %eax, %eax ret CFI_PUSH (%ebx) - + ALIGN (4) L(less1bytes): jb L(0bytesend) @@ -609,7 +609,7 @@ L(26bytes): mov -6(%edx), %ebx cmp %ebx, %ecx jne L(find_diff) - + movzwl -2(%eax), %ecx movzwl -2(%edx), %ebx cmp %bl, %cl @@ -873,7 +873,7 @@ L(32bytes): L(less16bytes): add %ebx, %eax add %ebx, %edx - + mov (%eax), %ecx mov (%edx), %ebx cmp %ebx, %ecx @@ -908,7 +908,7 @@ L(find_diff): jne L(end) cmp %bx, %cx L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index bfcf660729..869f37a912 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -43,8 +43,7 @@ #define BLK2 BLK1+4 #define LEN BLK2+4 #define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%edi); \ - CFI_PUSH (%esi) +#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state .section .text.ssse3,"ax",@progbits ENTRY (MEMCMP) @@ -76,12 +75,13 @@ L(1bytesend): L(zero): mov $0, %eax ret - + ALIGN (4) L(48bytesormore): PUSH (%ebx) PUSH (%esi) PUSH (%edi) + cfi_remember_state movdqu (%eax), %xmm3 movdqu (%edx), %xmm0 movl %eax, %edi @@ -155,7 +155,7 @@ L(shr_0): add $32, %esi sub $0xffff, %edx jnz L(exit) - + lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx @@ -163,6 +163,8 @@ L(shr_0): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_0_gobble): lea -48(%ecx), %ecx @@ -207,6 +209,8 @@ L(shr_0_gobble_loop_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_1): cmp $80, %ecx @@ -235,6 +239,8 @@ L(shr_1): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_1_gobble): sub $32, %ecx @@ -286,6 +292,8 @@ L(shr_1_gobble_next): jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_2): cmp $80, %ecx @@ -314,6 +322,8 @@ L(shr_2): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_2_gobble): sub $32, %ecx @@ -364,6 +374,8 @@ L(shr_2_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_3): cmp $80, %ecx @@ -392,6 +404,8 @@ L(shr_3): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_3_gobble): sub $32, %ecx @@ -442,6 +456,8 @@ L(shr_3_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_4): cmp $80, %ecx @@ 
-470,6 +486,8 @@ L(shr_4): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_4_gobble): sub $32, %ecx @@ -520,6 +538,8 @@ L(shr_4_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_5): cmp $80, %ecx @@ -548,6 +568,8 @@ L(shr_5): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_5_gobble): sub $32, %ecx @@ -598,6 +620,8 @@ L(shr_5_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_6): cmp $80, %ecx @@ -626,6 +650,8 @@ L(shr_6): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_6_gobble): sub $32, %ecx @@ -676,6 +702,8 @@ L(shr_6_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_7): cmp $80, %ecx @@ -704,6 +732,8 @@ L(shr_7): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_7_gobble): sub $32, %ecx @@ -754,6 +784,8 @@ L(shr_7_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_8): cmp $80, %ecx @@ -782,6 +814,8 @@ L(shr_8): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_8_gobble): sub $32, %ecx @@ -832,6 +866,8 @@ L(shr_8_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_9): cmp $80, %ecx @@ -860,6 +896,8 @@ L(shr_9): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_9_gobble): sub $32, %ecx @@ -910,6 +948,8 @@ L(shr_9_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_10): cmp $80, %ecx @@ -938,6 +978,8 @@ L(shr_10): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_10_gobble): sub $32, %ecx @@ -988,6 +1030,8 @@ L(shr_10_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_11): cmp $80, %ecx @@ -1016,6 +1060,8 @@ L(shr_11): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_11_gobble): sub $32, %ecx @@ -1066,6 +1112,8 @@ L(shr_11_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_12): cmp $80, %ecx @@ -1094,6 +1142,8 @@ L(shr_12): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_12_gobble): sub $32, %ecx @@ -1144,6 +1194,8 @@ L(shr_12_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_13): cmp $80, %ecx @@ -1172,6 +1224,8 @@ L(shr_13): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_13_gobble): sub $32, %ecx @@ -1222,6 +1276,8 @@ L(shr_13_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_14): cmp $80, %ecx @@ -1250,6 +1306,8 @@ L(shr_14): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_14_gobble): sub $32, %ecx @@ -1300,6 +1358,8 @@ L(shr_14_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_15): cmp $80, %ecx @@ -1328,6 +1388,8 @@ L(shr_15): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shr_15_gobble): sub $32, %ecx @@ -1378,6 +1440,8 @@ L(shr_15_gobble_next): POP (%esi) jmp L(less48bytes) + cfi_restore_state + cfi_remember_state ALIGN (4) L(exit): pmovmskb %xmm1, %ebx @@ -1497,8 +1561,9 @@ L(Byte31): 
movzbl -9(%edi), %eax movzbl -9(%esi), %edx sub %edx, %eax - RETURN + RETURN_END + CFI_PUSH (%ebx) ALIGN (4) L(more8bytes): cmp $16, %ecx diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index 977647203f..4b47851ed4 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -176,6 +176,7 @@ L(first4bytes): PUSH (%ebx) PUSH (%edi) PUSH (%esi) + cfi_remember_state mov %edx, %edi mov %eax, %esi xorl %eax, %eax @@ -241,6 +242,7 @@ L(ret): #endif ret + cfi_restore_state #ifdef USE_AS_STRNCMP L(more16byteseq): POP (%esi) @@ -253,6 +255,10 @@ L(eq): POP (%ebp) #endif ret + +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(neq): mov $1, %eax ja L(neq_bigger) @@ -263,6 +269,9 @@ L(neq_bigger): #endif ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(less16bytes): add $0xfefefeff, %ecx jnc L(less4bytes) @@ -370,8 +379,13 @@ L(more4bytes): movzbl 7(%eax), %ecx cmpb %cl, 7(%edx) jne L(neq) +#if 0 + // XXX bug in original code. It had a fallthru without any code cmpl $0, %ecx je L(eq) +#else + jmp L(eq) +#endif END (STRCMP) diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S index 14caae29a1..338b00339d 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -160,6 +160,9 @@ L(crosspage): PUSH (%ebx) PUSH (%edi) PUSH (%esi) +#ifdef USE_AS_STRNCMP + cfi_remember_state +#endif movl %edx, %edi movl %eax, %ecx @@ -254,7 +257,7 @@ L(loop_ashr_0): /* * The following cases will be handled by ashr_1 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(15) n -15 0(15 +(n-15) - n) ashr_1 */ .p2align 4 @@ -360,7 +363,7 @@ L(ashr_1_exittail): /* * The following cases will be handled by ashr_2 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 */ .p2align 4 @@ -467,7 +470,7 @@ L(ashr_2_exittail): /* * The following cases will be handled by ashr_3 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 */ .p2align 4 @@ -573,7 +576,7 @@ L(ashr_3_exittail): /* * The following cases will be handled by ashr_4 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 */ .p2align 4 @@ -682,7 +685,7 @@ L(ashr_4_exittail): /* * The following cases will be handled by ashr_5 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 */ .p2align 4 @@ -788,7 +791,7 @@ L(ashr_5_exittail): /* * The following cases will be handled by ashr_6 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 */ @@ -896,7 +899,7 @@ L(ashr_6_exittail): /* * The following cases will be handled by ashr_7 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset 
corresponding case * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 */ @@ -1006,7 +1009,7 @@ L(ashr_7_exittail): /* * The following cases will be handled by ashr_8 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 */ .p2align 4 @@ -1113,7 +1116,7 @@ L(ashr_8_exittail): /* * The following cases will be handled by ashr_9 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 */ .p2align 4 @@ -1219,7 +1222,7 @@ L(ashr_9_exittail): /* * The following cases will be handled by ashr_10 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 */ .p2align 4 @@ -1325,7 +1328,7 @@ L(ashr_10_exittail): /* * The following cases will be handled by ashr_11 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 */ .p2align 4 @@ -1431,7 +1434,7 @@ L(ashr_11_exittail): /* * The following cases will be handled by ashr_12 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 */ .p2align 4 @@ -1537,7 +1540,7 @@ L(ashr_12_exittail): /* * The following cases will be handled by ashr_13 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 */ .p2align 4 @@ -1643,7 +1646,7 @@ L(ashr_13_exittail): /* * The following cases will be handled by ashr_14 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 */ .p2align 4 @@ -1749,7 +1752,7 @@ L(ashr_14_exittail): /* * The following cases will be handled by ashr_14 - * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 */ @@ -1916,6 +1919,9 @@ L(less16bytes): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte0): #ifdef USE_AS_STRNCMP cmp $0, %ebp @@ -1931,6 +1937,9 @@ L(Byte0): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte1): #ifdef USE_AS_STRNCMP cmp $1, %ebp @@ -1946,6 +1955,9 @@ L(Byte1): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte2): #ifdef USE_AS_STRNCMP cmp $2, %ebp @@ -1961,6 +1973,9 @@ L(Byte2): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte3): #ifdef USE_AS_STRNCMP cmp $3, %ebp @@ -1976,6 +1991,9 @@ L(Byte3): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte4): #ifdef USE_AS_STRNCMP cmp $4, %ebp @@ -1989,7 +2007,11 @@ L(Byte4): POP (%ebp) #endif ret + .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte5): #ifdef USE_AS_STRNCMP cmp $5, %ebp @@ -2005,6 +2027,9 @@ L(Byte5): ret .p2align 4 +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(Byte6): #ifdef USE_AS_STRNCMP cmp $6, %ebp @@ -2020,6 +2045,9 @@ L(Byte6): ret .p2align 4 +#ifdef 
USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(2next_8_bytes): add $8, %eax add $8, %edx @@ -2063,6 +2091,9 @@ L(2next_8_bytes): #endif ret +#ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) +#endif L(neq): mov $1, %eax ja L(neq_bigger) @@ -2074,6 +2105,7 @@ L(neq_bigger): ret #ifdef USE_AS_STRNCMP + cfi_remember_state L(more8byteseq): POP (%esi) POP (%edi) @@ -2087,7 +2119,9 @@ L(eq): #endif xorl %eax, %eax ret + #ifdef USE_AS_STRNCMP + CFI_PUSH (%ebp) L(less16bytes_sncmp): test %ebp, %ebp jz L(eq) From d22ae6cf50dfb60e5f13fef8be5a3c8636733947 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Mon, 15 Feb 2010 18:47:02 -0800 Subject: [PATCH 18/27] Remove commented-out code. --- sysdeps/i386/i686/multiarch/strcmp-sse4.S | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index 4b47851ed4..e26f434222 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -379,13 +379,7 @@ L(more4bytes): movzbl 7(%eax), %ecx cmpb %cl, 7(%edx) jne L(neq) -#if 0 - // XXX bug in original code. It had a fallthru without any code - cmpl $0, %ecx - je L(eq) -#else jmp L(eq) -#endif END (STRCMP) From 0ea5278d33e7ad9b62c1d5d97cb287fdfcb8d977 Mon Sep 17 00:00:00 2001 From: Carl Fredrik Hammar Date: Wed, 17 Feb 2010 11:04:30 -0800 Subject: [PATCH 19/27] Use ioctl_handler_t typedef in Hurd ioctl macros. --- ChangeLog | 5 +++++ hurd/hurd/ioctl.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 1595a0a55e..5c5144b6fc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-02-17 Carl Fredrik Hammar + + * hurd/hurd/ioctl.h (_HURD_HANDLE_IOCTLS_1): Cast to + `ioctl_handler_t'. + 2010-02-15 Ulrich Drepper * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Fix unwind info. diff --git a/hurd/hurd/ioctl.h b/hurd/hurd/ioctl.h index ee156f02f9..e5ab3dc965 100644 --- a/hurd/hurd/ioctl.h +++ b/hurd/hurd/ioctl.h @@ -57,7 +57,7 @@ extern int hurd_register_ioctl_handler (int first_request, int last_request, static const struct ioctl_handler handler##_ioctl_handler##moniker \ __attribute__ ((__unused__)) = \ { _IOC_NOTYPE (first), _IOC_NOTYPE (last), \ - (int (*) (int, int, void *)) (handler), NULL }; \ + (ioctl_handler_t) (handler), NULL }; \ text_set_element (_hurd_ioctl_handler_lists, \ handler##_ioctl_handler##moniker) #define _HURD_HANDLE_IOCTLS(handler, first, last) \ From 951ca0c5ff38620c658be73701bcbc075ae4a53f Mon Sep 17 00:00:00 2001 From: Carl Fredrik Hammar Date: Wed, 17 Feb 2010 12:41:11 -0800 Subject: [PATCH 20/27] Clean up Hurd TIOCSCTTY. --- ChangeLog | 3 +++ hurd/hurdioctl.c | 44 +++++++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5c5144b6fc..a3e79d555e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2010-02-17 Carl Fredrik Hammar + * hurd/hurdioctl.c (tiocsctty): Only get FD ports, do work in... + (tiocsctty_port): ...this new function. + * hurd/hurd/ioctl.h (_HURD_HANDLE_IOCTLS_1): Cast to `ioctl_handler_t'. diff --git a/hurd/hurdioctl.c b/hurd/hurdioctl.c index 7c689841ca..1da8c0599d 100644 --- a/hurd/hurdioctl.c +++ b/hurd/hurdioctl.c @@ -1,5 +1,5 @@ /* ioctl commands which must be done in the C library. - Copyright (C) 1994,95,96,97,99,2001,2002,2009 + Copyright (C) 1994,95,96,97,99,2001,2002,2009,2010 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -239,34 +239,40 @@ _hurd_setcttyid (mach_port_t cttyid)
 }
 
-/* Make FD be the controlling terminal.
-   This function is called for `ioctl (fd, TCIOSCTTY)'.  */
-
-static int
-tiocsctty (int fd,
-	   int request)		/* Always TIOCSCTTY.  */
+static inline error_t
+tiocsctty_port (io_t port, io_t ctty)
 {
   mach_port_t cttyid;
   error_t err;
 
-  /* Get FD's cttyid port, unless it is already ours.  */
-  err = HURD_DPORT_USE (fd, ctty != MACH_PORT_NULL ? EADDRINUSE :
-			__term_getctty (port, &cttyid));
-  if (err == EADDRINUSE)
-    /* FD is already the ctty.  Nothing to do.  */
+  if (ctty != MACH_PORT_NULL)
+    /* PORT is already the ctty.  Nothing to do.  */
     return 0;
-  else if (err)
-    return __hurd_fail (err);
+
+  /* Get PORT's cttyid port.  */
+  err = __term_getctty (port, &cttyid);
+  if (err)
+    return err;
 
   /* Change the terminal's pgrp to ours.  */
-  err = HURD_DPORT_USE (fd, __tioctl_tiocspgrp (port, _hurd_pgrp));
+  err = __tioctl_tiocspgrp (port, _hurd_pgrp);
   if (err)
-    return __hurd_fail (err);
+    __mach_port_deallocate (__mach_task_self (), cttyid);
+  else
+    /* Make it our own.  */
+    install_ctty (cttyid);
 
-  /* Make it our own.  */
-  install_ctty (cttyid);
+  return err;
+}
 
-  return 0;
+/* Make FD be the controlling terminal.
+   This function is called for `ioctl (fd, TCIOSCTTY)'.  */
+
+static int
+tiocsctty (int fd,
+	   int request)		/* Always TIOCSCTTY.  */
+{
+  return __hurd_fail (HURD_DPORT_USE (fd, tiocsctty_port (port, ctty)));
 }
 
 _HURD_HANDLE_IOCTL (tiocsctty, TIOCSCTTY);

From 82abe82ad7e24a1c1f350fa78ea23a3e6caadff5 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 17 Feb 2010 22:13:55 -0800
Subject: [PATCH 21/27] Fix and cleanup unwind info in x86 strcmp-ssse.

---
 ChangeLog                                  |  6 ++
 sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 87 ++++------------------
 2 files changed, 20 insertions(+), 73 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a3e79d555e..ab028f632f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2010-02-17  H.J. Lu
+	    Ulrich Drepper
+
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Fix typo in unwind info.
+	Clean up a bit.
+
 2010-02-17  Carl Fredrik Hammar
 
 	* hurd/hurdioctl.c (tiocsctty): Only get FD ports, do work in...
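The strcmp-ssse3.S hunks that follow all apply one unwind-info idiom, sketched
here in isolation (a minimal illustration: the function and label names are
invented, and PUSH/POP/CFI_PUSH are the macros defined at the top of
strcmp-ssse3.S earlier in this series).  cfi_remember_state snapshots the
unwind state while a callee-saved register is still on the stack, and
cfi_restore_state re-asserts that snapshot for code placed after a ret, which
would otherwise inherit the already-popped state:

ENTRY (__example)
	PUSH	(%ebx)			/* CFA += 4; %ebx saved at CFA-8.  */
	cfi_remember_state		/* Snapshot: %ebx still pushed.  */
	movl	8(%esp), %ebx
	testl	%ebx, %ebx
	jz	L(zero)
	POP	(%ebx)
	ret				/* CFI now says %ebx is restored...  */
	cfi_restore_state		/* ...so re-assert the snapshot: the
					   code below is reached only while
					   %ebx is still on the stack.  */
	.p2align 4
L(zero):
	xorl	%eax, %eax
	POP	(%ebx)
	ret
END (__example)

The RETURN macro introduced below plays the same trick for %ebp in the strncmp
case: pop, return, then re-assert the pushed state with CFI_PUSH for the
aligned code that follows the return.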
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S index 338b00339d..40994c05b1 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -40,6 +40,7 @@ # endif # define STR1 4 # define STR2 STR1+4 +# define RETURN ret; .p2align 4 # define UPDATE_STRNCMP_COUNTER #else # ifndef STRCMP @@ -48,7 +49,7 @@ # define STR1 8 # define STR2 STR1+4 # define CNT STR2+4 - +# define RETURN POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp) # define UPDATE_STRNCMP_COUNTER \ /* calculate left number to compare */ \ mov $16, %esi; \ @@ -1913,15 +1914,8 @@ L(less16bytes): movzx 7(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte0): #ifdef USE_AS_STRNCMP cmp $0, %ebp @@ -1931,15 +1925,8 @@ L(Byte0): movzx (%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte1): #ifdef USE_AS_STRNCMP cmp $1, %ebp @@ -1949,15 +1936,8 @@ L(Byte1): movzx 1(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte2): #ifdef USE_AS_STRNCMP cmp $2, %ebp @@ -1967,15 +1947,8 @@ L(Byte2): movzx 2(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte3): #ifdef USE_AS_STRNCMP cmp $3, %ebp @@ -1985,15 +1958,8 @@ L(Byte3): movzx 3(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte4): #ifdef USE_AS_STRNCMP cmp $4, %ebp @@ -2003,15 +1969,8 @@ L(Byte4): movzx 4(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte5): #ifdef USE_AS_STRNCMP cmp $5, %ebp @@ -2021,15 +1980,8 @@ L(Byte5): movzx 5(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(Byte6): #ifdef USE_AS_STRNCMP cmp $6, %ebp @@ -2039,15 +1991,8 @@ L(Byte6): movzx 6(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(2next_8_bytes): add $8, %eax add $8, %edx @@ -2086,14 +2031,8 @@ L(2next_8_bytes): movzx 7(%edx), %eax sub %ecx, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(neq): mov $1, %eax ja L(neq_bigger) @@ -2105,7 +2044,8 @@ L(neq_bigger): ret #ifdef USE_AS_STRNCMP - cfi_remember_state + .p2align 4 + cfi_restore_state L(more8byteseq): POP (%esi) POP (%edi) @@ -2121,6 +2061,7 @@ L(eq): ret #ifdef USE_AS_STRNCMP + .p2align 4 CFI_PUSH (%ebp) L(less16bytes_sncmp): test %ebp, %ebp From 28be6098c306bcfcffc1a0b356b76c95c20dfddf Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 17 Feb 2010 22:27:41 -0800 Subject: [PATCH 22/27] Simplify x86 strcmp-sse4 unwind info. --- ChangeLog | 4 ++++ sysdeps/i386/i686/multiarch/strcmp-sse4.S | 22 +++++++--------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index ab028f632f..c8f89b8884 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2010-02-16 H.J. Lu + + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Simplify unwind info. + 2010-02-17 H.J. 
Lu Ulrich Drepper diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index e26f434222..d5fd23e15c 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -40,6 +40,7 @@ # endif # define STR1 4 # define STR2 STR1+4 +# define RETURN ret; .p2align 4 #else # ifndef STRCMP # define STRCMP __strncmp_sse4_2 # endif # define STR1 8 # define STR2 STR1+4 # define CNT STR2+4 +# define RETURN POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp) #endif .section .text.sse4.2,"ax",@progbits @@ -223,6 +225,7 @@ L(crosspage): add $16, %esi jmp L(check_offset) + .p2align 4 L(end): jnc L(ret) #ifdef USE_AS_STRNCMP @@ -242,6 +245,7 @@ L(ret): #endif ret + .p2align 4 cfi_restore_state #ifdef USE_AS_STRNCMP L(more16byteseq): @@ -251,27 +255,15 @@ L(more16byteseq): #endif L(eq): xorl %eax, %eax -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret + RETURN -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif L(neq): mov $1, %eax ja L(neq_bigger) neg %eax L(neq_bigger): -#ifdef USE_AS_STRNCMP - POP (%ebp) -#endif - ret - .p2align 4 -#ifdef USE_AS_STRNCMP - CFI_PUSH (%ebp) -#endif + RETURN + L(less16bytes): add $0xfefefeff, %ecx jnc L(less4bytes) From c60bce2cdd757a96077f2ff0147619abacfabfbb Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 17 Feb 2010 22:35:18 -0800 Subject: [PATCH 23/27] Fix unwind info in x86 memcmp-ssse3. --- ChangeLog | 2 ++ sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index c8f89b8884..600c65d30c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2010-02-16 H.J. Lu + * sysdeps/i386/i686/multiarch/memcmp-ssse3.S (less1bytes): Add CFI_POP. + * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Simplify unwind info. 2010-02-17 H.J. Lu diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index 869f37a912..d2f852f726 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -60,6 +60,7 @@ ENTRY (MEMCMP) jmp L(less48bytes) ALIGN (4) + CFI_POP (%ebx) L(less1bytes): jb L(zero) movb (%eax), %cl @@ -156,7 +157,6 @@ L(shr_0): sub $0xffff, %edx jnz L(exit) - lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx POP (%edi) @@ -1662,7 +1662,6 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) L(less48bytes): cmp $8, %ecx @@ -1679,9 +1678,6 @@ L(less48bytes): je L(6bytes) jmp L(7bytes) - - - ALIGN (4) L(44bytes): mov -44(%eax), %ecx From 020ecba7fc0cde6febac28e114a56f9eb47dc871 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 17 Feb 2010 23:01:55 -0800 Subject: [PATCH 24/27] Align x86 memcmp-sse4.S and fix unwind info. --- ChangeLog | 3 +++ sysdeps/i386/i686/multiarch/memcmp-sse4.S | 30 +++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 600c65d30c..5f2d25cc85 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2010-02-16 H.J. Lu + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Add alignments. + Fix one unwind info problem. + * sysdeps/i386/i686/multiarch/memcmp-ssse3.S (less1bytes): Add CFI_POP. * sysdeps/i386/i686/multiarch/strcmp-sse4.S: Simplify unwind info.
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S index 71c4e1c337..b1ed778f1f 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -96,8 +96,9 @@ ENTRY (MEMCMP) add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) -L(less8bytes): + ALIGN (4) +L(less8bytes): mov (%eax), %bl cmpb (%edx), %bl jne L(nonzero) @@ -154,7 +155,6 @@ L(0bytes): POP (%ebx) xor %eax, %eax ret - CFI_PUSH (%ebx) ALIGN (4) L(less1bytes): @@ -207,6 +207,8 @@ L(64bytesormore_loop): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + + ALIGN (4) L(find_16diff): sub $16, %ecx L(find_32diff): @@ -217,8 +219,8 @@ L(find_64diff): add %ecx, %edx add %ecx, %eax jmp L(16bytes) - ALIGN (4) + ALIGN (4) L(16bytes): mov -16(%eax), %ecx mov -16(%edx), %ebx @@ -377,7 +379,7 @@ L(1bytes): jne L(end) RETURN - + ALIGN (4) L(52bytes): movdqu -52(%eax), %xmm1 movdqu -52(%edx), %xmm2 @@ -406,6 +408,7 @@ L(20bytes): jne L(find_diff) RETURN + ALIGN (4) L(53bytes): movdqu -53(%eax), %xmm1 movdqu -53(%edx), %xmm2 @@ -437,6 +440,7 @@ L(21bytes): jne L(end) RETURN + ALIGN (4) L(54bytes): movdqu -54(%eax), %xmm1 movdqu -54(%edx), %xmm2 @@ -472,6 +476,7 @@ L(22bytes): jne L(end) RETURN + ALIGN (4) L(55bytes): movdqu -55(%eax), %xmm1 movdqu -55(%edx), %xmm2 @@ -509,6 +514,7 @@ L(23bytes): jne L(end) RETURN + ALIGN (4) L(56bytes): movdqu -56(%eax), %xmm1 movdqu -56(%edx), %xmm2 @@ -543,6 +549,7 @@ L(24bytes): jne L(find_diff) RETURN + ALIGN (4) L(57bytes): movdqu -57(%eax), %xmm1 movdqu -57(%edx), %xmm2 @@ -578,6 +585,7 @@ L(25bytes): jne L(end) RETURN + ALIGN (4) L(58bytes): movdqu -58(%eax), %xmm1 movdqu -58(%edx), %xmm2 @@ -619,6 +627,7 @@ L(26bytes): jne L(end) RETURN + ALIGN (4) L(59bytes): movdqu -59(%eax), %xmm1 movdqu -59(%edx), %xmm2 @@ -660,6 +669,7 @@ L(27bytes): jne L(end) RETURN + ALIGN (4) L(60bytes): movdqu -60(%eax), %xmm1 movdqu -60(%edx), %xmm2 @@ -696,6 +706,7 @@ L(28bytes): jne L(find_diff) RETURN + ALIGN (4) L(61bytes): movdqu -61(%eax), %xmm1 movdqu -61(%edx), %xmm2 @@ -738,6 +749,7 @@ L(29bytes): jne L(end) RETURN + ALIGN (4) L(62bytes): movdqu -62(%eax), %xmm1 movdqu -62(%edx), %xmm2 @@ -780,6 +792,7 @@ L(30bytes): jne L(end) RETURN + ALIGN (4) L(63bytes): movdqu -63(%eax), %xmm1 movdqu -63(%edx), %xmm2 @@ -826,6 +839,7 @@ L(31bytes): jne L(end) RETURN + ALIGN (4) L(64bytes): movdqu -64(%eax), %xmm1 movdqu -64(%edx), %xmm2 @@ -870,6 +884,7 @@ L(32bytes): jne L(find_diff) RETURN + ALIGN (4) L(less16bytes): add %ebx, %eax add %ebx, %edx @@ -914,8 +929,11 @@ L(end): neg %eax L(bigger): ret +END (MEMCMP) + .section .rodata.sse4.2,"a",@progbits ALIGN (2) + .type L(table_64bytes), @object L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -982,7 +1000,5 @@ L(table_64bytes): .int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes)) - -END (MEMCMP) - + .size L(table_64bytes), .-L(table_64bytes) #endif From 039c8ae6d5d54e1e677b317a0131480c6036a8e3 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Thu, 18 Feb 2010 23:11:21 -0800 Subject: [PATCH 25/27] Use CPUID_OFFSET instead of FEATURE_OFFSET --- ChangeLog | 4 ++++ sysdeps/i386/i686/multiarch/memcmp.S | 2 +- sysdeps/i386/i686/multiarch/strcmp.S | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5f2d25cc85..69906fa642 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2010-02-16 H.J. Lu + * sysdeps/i386/i686/multiarch/memcmp.S (memcmp): Use CPUID_OFFSET + instead of FEATURE_OFFSET. + * sysdeps/i386/i686/multiarch/strcmp.S (strcmp): Likewise. + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Add alignnments. Fix one unwind info problem. diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S index fa7c52a003..cf606a5959 100644 --- a/sysdeps/i386/i686/multiarch/memcmp.S +++ b/sysdeps/i386/i686/multiarch/memcmp.S @@ -58,7 +58,7 @@ ENTRY(memcmp) testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features jz 2f leal __memcmp_ssse3, %eax - testl $bit_SSE4_2, FEATURE_OFFSET+index_SSE4_2+__cpu_features + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features jz 2f leal __memcmp_sse4_2, %eax 2: ret diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S index 79a1fdfd43..7136d47e85 100644 --- a/sysdeps/i386/i686/multiarch/strcmp.S +++ b/sysdeps/i386/i686/multiarch/strcmp.S @@ -83,7 +83,7 @@ ENTRY(STRCMP) testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features jz 2f leal __STRCMP_SSSE3, %eax - testl $bit_SSE4_2, FEATURE_OFFSET+index_SSE4_2+__cpu_features + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features jz 2f leal __STRCMP_SSE4_2, %eax 2: ret From 8c3fe38d2541f9597c34192a8ca44cd4f9cdefe0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 18 Feb 2010 23:12:10 -0800 Subject: [PATCH 26/27] Whitespace fix. --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 69906fa642..e458ee1fff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,7 +3,7 @@ * sysdeps/i386/i686/multiarch/memcmp.S (memcmp): Use CPUID_OFFSET instead of FEATURE_OFFSET. * sysdeps/i386/i686/multiarch/strcmp.S (strcmp): Likewise. - + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Add alignnments. Fix one unwind info problem. From 199428c19774c12b3c4b6e6486ea9d4a021288af Mon Sep 17 00:00:00 2001 From: Carl Fredrik Hammar Date: Fri, 19 Feb 2010 11:08:00 -0800 Subject: [PATCH 27/27] Fix Hurd tiocsctty change. --- ChangeLog | 5 +++++ hurd/hurdioctl.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index e458ee1fff..1044b4de65 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-02-19 Carl Fredrik Hammar + + * hurd/hurdioctl.c (tiocsctty): Call `do_tiocsctty' instead of + non-existent `tiocsctty_port'. + 2010-02-16 H.J. Lu * sysdeps/i386/i686/multiarch/memcmp.S (memcmp): Use CPUID_OFFSET diff --git a/hurd/hurdioctl.c b/hurd/hurdioctl.c index 1da8c0599d..04d98629ef 100644 --- a/hurd/hurdioctl.c +++ b/hurd/hurdioctl.c @@ -272,7 +272,7 @@ static int tiocsctty (int fd, int request) /* Always TIOCSCTTY. */ { - return __hurd_fail (HURD_DPORT_USE (fd, tiocsctty_port (port, ctty))); + return __hurd_fail (HURD_DPORT_USE (fd, do_tiocsctty (port, ctty))); } _HURD_HANDLE_IOCTL (tiocsctty, TIOCSCTTY);