diff --git a/ChangeLog b/ChangeLog
index cc4855fc27..d5b67e804f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,184 @@
+2010-02-19  Carl Fredrik Hammar  <hammy.lite@gmail.com>
+
+	* hurd/hurdioctl.c (tiocsctty): Call `do_tiocsctty' instead of
+	non-existent `tiocsctty_port'.
+
+2010-02-16  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/memcmp.S (memcmp): Use CPUID_OFFSET
+	instead of FEATURE_OFFSET.
+	* sysdeps/i386/i686/multiarch/strcmp.S (strcmp): Likewise.
+
+	* sysdeps/i386/i686/multiarch/memcmp-sse4.S: Add alignments.
+	Fix one unwind info problem.
+
+	* sysdeps/i386/i686/multiarch/memcmp-ssse3.S (less1bytes): Add CFI_POP.
+
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Simplify unwind info.
+
+2010-02-17  H.J. Lu  <hongjiu.lu@intel.com>
+	    Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Fix typo in unwind info.
+	Clean up a bit.
+
+2010-02-17  Carl Fredrik Hammar  <hammy.lite@gmail.com>
+
+	* hurd/hurdioctl.c (tiocsctty): Only get FD ports, do work in...
+	(tiocsctty_port): ...this new function.
+
+	* hurd/hurd/ioctl.h (_HURD_HANDLE_IOCTLS_1): Cast to
+	`ioctl_handler_t'.
+
+2010-02-15  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/i386/i686/multiarch/memcmp-sse4.S: Fix unwind info.
+	* sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Likewise.
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Likewise.
+
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Don't fall through to
+	undefined code.
+
+2010-02-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	strcmp-ssse3, strcmp-sse4, strncmp-c, strncmp-ssse3, strncmp-sse4,
+	memcmp-c, memcmp-ssse3, and memcmp-sse4.
+	* sysdeps/i386/i686/multiarch/memcmp-sse4.S: New file.
+	* sysdeps/i386/i686/multiarch/memcmp-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/memcmp.S: New file.
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: New file.
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strcmp.S: New file.
+	* sysdeps/i386/i686/multiarch/strncmp-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strncmp-sse4.S: New file.
+	* sysdeps/i386/i686/multiarch/strncmp-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strncmp.S: New file.
+
+2010-02-12  Luis Machado  <luisgpm@br.ibm.com>
+
+	* sysdeps/powerpc/powerpc32/dl-machine.h: Removed old PPC_REL16 check.
+	* sysdeps/powerpc/powerpc32/elf/start.S: Likewise.
+	* sysdeps/powerpc/powerpc32/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_round.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_truncf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_floorf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_ceilf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_ceil.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_floor.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_roundf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_rintf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_trunc.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/setjmp-common.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_lround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_rint.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/dl-start.S: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S:
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S:
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S:
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S:
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S: Likewise.
+	* sysdeps/powerpc/powerpc32/configure.in: Fail if R_PPC_REL16
+	is not supported.
+
+2010-02-12  Alan Modra  <amodra@gmail.com>
+
+	* elf/tls-macros.h [__powerpc__] (__TLS_CALL_CLOBBERS): Remove r3.
+	Define and use for __powerpc64__ too.
+	[__powerpc__] (TLS_LD): Add r3 to clobbers.
+	[__powerpc__] (TLS_GD): Set asm output.  Make __result r3 reg.
+	[__powerpc64__] (TLS_GD): Make __result r3 reg.
+	[__powerpc64__] (TLS_IE): Relax output constraint.
+
+2010-02-11  Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
+
+	* sysdeps/s390/s390-64/utf8-utf16-z9.c: Disable hardware
+	instructions cu21 and cu24.  Add well-formedness checking
+	parameter and adjust the software implementation.
+	* sysdeps/s390/s390-64/utf16-utf32-z9.c: Likewise.
+
+2010-02-10  Ulrich Drepper  <drepper@redhat.com>
+
+	[BZ #11271]
+	* io/ftw.c (ftw_startup): Close descriptor for initial directory
+	after changing back to it.
+
+2010-02-05  David S. Miller  <davem@davemloft.net>
+
+	* elf/elf.h (R_SPARC_JMP_IREL, R_SPARC_IRELATIVE): Define.
+	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_rela): Handle new
+	ifunc relocs.
+	(elf_machine_lazy_rel): Likewise.
+	(sparc_fixup_plt): Pull out to...
+	* sysdeps/sparc/sparc32/dl-plt.h: ...here.
+	* sysdeps/sparc/sparc32/dl-irel.h: New file.
+	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_rela): Handle new
+	ifunc relocs.
+	(elf_machine_lazy_rel): Likewise.
+	(sparc64_fixup_plt): Pull out to...
+	* sysdeps/sparc/sparc64/dl-plt.h: ...here.
+	* sysdeps/sparc/sparc64/dl-irel.h: New file.
+
+2010-02-09  Maxim Kuvyrkov  <maxim@codesourcery.com>
+
+	* elf/elf.h: Define m68k TLS relocations.
+
+2010-02-10  Luis Machado  <luisgpm@br.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/Implies: Removed.
+	* sysdeps/powerpc/powerpc64/power7/fpu/Implies: Removed.
+	* sysdeps/powerpc/powerpc32/power7/Implies: Removed.
+	* sysdeps/powerpc/powerpc32/power7/fpu/Implies: Removed.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies: Add
+	64-bit power7 fpu path.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies: Add
+	32-bit power7 fpu path.
+
+2010-02-09  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/i386/rshift.S: More compact unwind information.
+
+	* sysdeps/i386/lshift.S: Fix unwind information.
+
+2010-02-08  Luis Machado  <luisgpm@br.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S: New file.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S: New file.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S: New file.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S: New file.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S: New file.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S: New file.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S: New file.
+
+2010-02-08  Andreas Schwab  <schwab@redhat.com>
+
+	* include/fenv.h: Add hidden proto for fegetenv.
+	* math/fegetenv.c: Add hidden alias.
+	* sysdeps/i386/fpu/fegetenv.c: Likewise.
+	* sysdeps/ia64/fpu/fegetenv.c: Likewise.
+	* sysdeps/powerpc/fpu/fegetenv.c: Likewise.
+	* sysdeps/sh/sh4/fpu/fegetenv.c: Likewise.
+	* sysdeps/sparc/fpu/fegetenv.c: Likewise.
+	* sysdeps/x86_64/fpu/fegetenv.c: Likewise.
+	* sysdeps/s390/fpu/fegetenv.c: Likewise.  Remove unused headers.
+
 2010-02-12  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: Use unsigned
diff --git a/README b/README
index 322b4dac4a..7c27e3652a 100644
--- a/README
+++ b/README
@@ -14,20 +14,23 @@ In GNU/Hurd systems, it works with a microkernel and Hurd servers.
 The GNU C Library implements much of the POSIX.1 functionality in the
 GNU/Hurd system, using configurations i[34567]86-*-gnu.
 
-When working with Linux kernels, the GNU C Library version 2.4 is
-intended primarily for use with Linux kernel version 2.6.0 and later.
-We only support using the NPTL implementation of pthreads, which is now
-the default configuration.  Most of the C library will continue to work
-on older Linux kernels and many programs will not require a 2.6 kernel
-to run correctly.  However, pthreads and related functionality will not
-work at all on old kernels and we do not recommend using glibc 2.4 with
-any Linux kernel prior to 2.6.
+When working with Linux kernels, GNU C Library versions from 2.4
+onwards are intended primarily for use with Linux kernel version
+2.6.0 and later.  We only support using the NPTL implementation of
+pthreads, which is now the default configuration.  Most of the C
+library will continue to work on older Linux kernels and many programs
+will not require a 2.6 kernel to run correctly.  However, pthreads and
+related functionality will not work at all on old kernels and we do
+not recommend using glibc 2.4 with any Linux kernel prior to 2.6.
 
 All Linux kernel versions prior to 2.6.16 are known to have some bugs that
 may cause some of the tests related to pthreads in "make check" to fail.
 If you see such problems, please try the test suite on the most recent
 Linux kernel version that you can use, before pursuing those bugs further.
 
+Also note that the shared version of the libgcc_s library must be
+installed for the pthread library to work correctly.
+
 The old LinuxThreads add-on implementation of pthreads for older Linux
 kernels is no longer supported, and we are not distributing it with this
 release.  Someone has volunteered to revive its maintenance unofficially
@@ -48,7 +51,6 @@ The GNU C Library supports these configurations for using Linux kernels:
 	sparc*-*-linux-gnu
 	sparc64*-*-linux-gnu
 
-	alpha*-*-linux-gnu	Requires Linux 2.6.9 for NPTL
 	sh[34]-*-linux-gnu	Requires Linux 2.6.11
 
 The code for other CPU configurations supported by volunteers outside of
@@ -57,6 +59,7 @@ add-on.  You can find glibc-ports-VERSION distributed separately in the
 same place where you got the main glibc distribution files.
 Currently these configurations are known to work using the `ports' add-on:
 
+	alpha*-*-linux-gnu	Requires Linux 2.6.9 for NPTL
 	arm-*-linux-gnu		Requires Linux 2.6.15 for NPTL, no SMP support
 	arm-*-linux-gnueabi	Requires Linux 2.6.16-rc1 for NPTL, no SMP
 	mips-*-linux-gnu	Requires Linux 2.6.12 for NPTL
diff --git a/elf/elf.h b/elf/elf.h
index 8af7c177ce..204a0f9e19 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -1123,8 +1123,29 @@ typedef struct
 #define R_68K_GLOB_DAT	20		/* Create GOT entry */
 #define R_68K_JMP_SLOT	21		/* Create PLT entry */
 #define R_68K_RELATIVE	22		/* Adjust by program base */
+#define R_68K_TLS_GD32      25          /* 32 bit GOT offset for GD */
+#define R_68K_TLS_GD16      26          /* 16 bit GOT offset for GD */
+#define R_68K_TLS_GD8       27          /* 8 bit GOT offset for GD */
+#define R_68K_TLS_LDM32     28          /* 32 bit GOT offset for LDM */
+#define R_68K_TLS_LDM16     29          /* 16 bit GOT offset for LDM */
+#define R_68K_TLS_LDM8      30          /* 8 bit GOT offset for LDM */
+#define R_68K_TLS_LDO32     31          /* 32 bit module-relative offset */
+#define R_68K_TLS_LDO16     32          /* 16 bit module-relative offset */
+#define R_68K_TLS_LDO8      33          /* 8 bit module-relative offset */
+#define R_68K_TLS_IE32      34          /* 32 bit GOT offset for IE */
+#define R_68K_TLS_IE16      35          /* 16 bit GOT offset for IE */
+#define R_68K_TLS_IE8       36          /* 8 bit GOT offset for IE */
+#define R_68K_TLS_LE32      37          /* 32 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_LE16      38          /* 16 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_LE8       39          /* 8 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_DTPMOD32  40          /* 32 bit module number */
+#define R_68K_TLS_DTPREL32  41          /* 32 bit module-relative offset */
+#define R_68K_TLS_TPREL32   42          /* 32 bit TP-relative offset */
 /* Keep this the last entry.  */
-#define R_68K_NUM	23
+#define R_68K_NUM	43
 
 /* Intel 80386 specific definitions.  */
 
@@ -1303,6 +1324,8 @@ typedef struct
 #define R_SPARC_H34		85
 #define R_SPARC_SIZE32		86
 #define R_SPARC_SIZE64		87
+#define R_SPARC_JMP_IREL	248
+#define R_SPARC_IRELATIVE	249
 #define R_SPARC_GNU_VTINHERIT	250
 #define R_SPARC_GNU_VTENTRY	251
 #define R_SPARC_REV32		252
diff --git a/elf/tls-macros.h b/elf/tls-macros.h
index 6463a6c3f9..781256db1e 100644
--- a/elf/tls-macros.h
+++ b/elf/tls-macros.h
@@ -701,154 +701,146 @@ register void *__gp __asm__("$29");
      (int *) (__builtin_thread_pointer() + __offset); })
 # endif
 
-#elif defined __powerpc__ && !defined __powerpc64__
+#elif defined __powerpc__
 
-#include "config.h"
-
-# define __TLS_CALL_CLOBBERS						\
-	"0", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",	\
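+/* r3 is intentionally not listed here: TLS_GD names it as the asm output
+   and TLS_LD adds it to the clobber list explicitly.  */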
+# define __TLS_CALL_CLOBBERS						      \
+	"0", "4", "5", "6", "7", "8", "9", "10", "11", "12",		      \
 	"lr", "ctr", "cr0", "cr1", "cr5", "cr6", "cr7"
 
+# ifndef __powerpc64__
+
+#  include "config.h"
+
 /* PowerPC32 Local Exec TLS access.  */
-# define TLS_LE(x)				\
-  ({ int *__result;				\
-     asm ("addi %0,2," #x "@tprel"		\
-	  : "=r" (__result));			\
+#  define TLS_LE(x)							      \
+  ({ int *__result;							      \
+     asm ("addi %0,2," #x "@tprel"					      \
+	  : "=r" (__result));						      \
      __result; })
 
 /* PowerPC32 Initial Exec TLS access.  */
-# ifdef HAVE_ASM_PPC_REL16
-#  define TLS_IE(x)					\
-  ({ int *__result;					\
-     asm ("bcl 20,31,1f\n1:\t"				\
-	  "mflr %0\n\t"					\
-	  "addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"	\
-	  "addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"	\
-	  "lwz %0," #x "@got@tprel(%0)\n\t"		\
-	  "add %0,%0," #x "@tls"			\
-	  : "=b" (__result) :				\
-	  : "lr");					\
+#  ifdef HAVE_ASM_PPC_REL16
+#   define TLS_IE(x)							      \
+  ({ int *__result;							      \
+     asm ("bcl 20,31,1f\n1:\t"						      \
+	  "mflr %0\n\t"							      \
+	  "addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"			      \
+	  "addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"			      \
+	  "lwz %0," #x "@got@tprel(%0)\n\t"				      \
+	  "add %0,%0," #x "@tls"					      \
+	  : "=b" (__result) :						      \
+	  : "lr");							      \
      __result; })
-# else
-#  define TLS_IE(x)					\
-  ({ int *__result;					\
-     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"	\
-	  "mflr %0\n\t"					\
-	  "lwz %0," #x "@got@tprel(%0)\n\t"		\
-	  "add %0,%0," #x "@tls"			\
-	  : "=b" (__result) :				\
-	  : "lr");					\
+#  else
+#   define TLS_IE(x)							      \
+  ({ int *__result;							      \
+     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"			      \
+	  "mflr %0\n\t"							      \
+	  "lwz %0," #x "@got@tprel(%0)\n\t"				      \
+	  "add %0,%0," #x "@tls"					      \
+	  : "=b" (__result) :						      \
+	  : "lr");							      \
      __result; })
-# endif
+#  endif
 
 /* PowerPC32 Local Dynamic TLS access.  */
-# ifdef HAVE_ASM_PPC_REL16
-#  define TLS_LD(x)					\
-  ({ int *__result;					\
-     asm ("bcl 20,31,1f\n1:\t"				\
-	  "mflr 3\n\t"					\
-	  "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"	\
-	  "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"	\
-	  "addi 3,3," #x "@got@tlsld\n\t"		\
-	  "bl __tls_get_addr@plt\n\t"			\
-	  "addi %0,3," #x "@dtprel"			\
-	  : "=r" (__result) :				\
-	  : __TLS_CALL_CLOBBERS);			\
+#  ifdef HAVE_ASM_PPC_REL16
+#   define TLS_LD(x)							      \
+  ({ int *__result;							      \
+     asm ("bcl 20,31,1f\n1:\t"						      \
+	  "mflr 3\n\t"							      \
+	  "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"			      \
+	  "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"			      \
+	  "addi 3,3," #x "@got@tlsld\n\t"				      \
+	  "bl __tls_get_addr@plt\n\t"					      \
+	  "addi %0,3," #x "@dtprel"					      \
+	  : "=r" (__result) :						      \
+	  : "3", __TLS_CALL_CLOBBERS);					      \
      __result; })
-# else
-#  define TLS_LD(x)					\
-  ({ int *__result;					\
-     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"	\
-	  "mflr 3\n\t"					\
-	  "addi 3,3," #x "@got@tlsld\n\t"		\
-	  "bl __tls_get_addr@plt\n\t"			\
-	  "addi %0,3," #x "@dtprel"			\
-	  : "=r" (__result) :				\
-	  : __TLS_CALL_CLOBBERS);			\
+#  else
+#   define TLS_LD(x)							      \
+  ({ int *__result;							      \
+     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"			      \
+	  "mflr 3\n\t"							      \
+	  "addi 3,3," #x "@got@tlsld\n\t"				      \
+	  "bl __tls_get_addr@plt\n\t"					      \
+	  "addi %0,3," #x "@dtprel"					      \
+	  : "=r" (__result) :						      \
+	  : "3", __TLS_CALL_CLOBBERS);					      \
      __result; })
-# endif
+#  endif
 
 /* PowerPC32 General Dynamic TLS access.  */
-# ifdef HAVE_ASM_PPC_REL16
-#  define TLS_GD(x)					\
-  ({ register int *__result __asm__ ("r3");		\
-     asm ("bcl 20,31,1f\n1:\t"				\
-	  "mflr 3\n\t"					\
-	  "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"	\
-	  "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"	\
-	  "addi 3,3," #x "@got@tlsgd\n\t"		\
-	  "bl __tls_get_addr@plt"			\
-	  : :						\
-	  : __TLS_CALL_CLOBBERS);			\
+#  ifdef HAVE_ASM_PPC_REL16
+#   define TLS_GD(x)							      \
+  ({ register int *__result __asm__ ("r3");				      \
+     asm ("bcl 20,31,1f\n1:\t"						      \
+	  "mflr 3\n\t"							      \
+	  "addis 3,3,_GLOBAL_OFFSET_TABLE_-1b@ha\n\t"			      \
+	  "addi 3,3,_GLOBAL_OFFSET_TABLE_-1b@l\n\t"			      \
+	  "addi 3,3," #x "@got@tlsgd\n\t"				      \
+	  "bl __tls_get_addr@plt"					      \
+	  : "=r" (__result) :						      \
+	  : __TLS_CALL_CLOBBERS);					      \
      __result; })
-# else
-#  define TLS_GD(x)					\
-  ({ register int *__result __asm__ ("r3");		\
-     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"	\
-	  "mflr 3\n\t"					\
-	  "addi 3,3," #x "@got@tlsgd\n\t"		\
-	  "bl __tls_get_addr@plt"			\
-	  : :						\
-	  : __TLS_CALL_CLOBBERS);			\
+#  else
+#   define TLS_GD(x)							      \
+  ({ register int *__result __asm__ ("r3");				      \
+     asm ("bl _GLOBAL_OFFSET_TABLE_@local-4\n\t"			      \
+	  "mflr 3\n\t"							      \
+	  "addi 3,3," #x "@got@tlsgd\n\t"				      \
+	  "bl __tls_get_addr@plt"					      \
+	  : "=r" (__result) :						      \
+	  : __TLS_CALL_CLOBBERS);					      \
      __result; })
-# endif
+#  endif
 
-#elif defined __powerpc__ && defined __powerpc64__
+# else
 
 /* PowerPC64 Local Exec TLS access.  */
-# define TLS_LE(x) \
-  ({  int * __result;  \
-      asm ( \
-        "  addis %0,13," #x "@tprel@ha\n"  \
-        "  addi  %0,%0," #x "@tprel@l\n"   \
-        : "=b" (__result) );  \
-      __result;  \
+#  define TLS_LE(x)							      \
+  ({ int * __result;							      \
+     asm ("addis %0,13," #x "@tprel@ha\n\t"				      \
+	  "addi  %0,%0," #x "@tprel@l"					      \
+	  : "=b" (__result) );						      \
+     __result;								      \
   })
 /* PowerPC64 Initial Exec TLS access.  */
-#  define TLS_IE(x) \
-  ({  int * __result;  \
-      asm (  \
-        "  ld  %0," #x "@got@tprel(2)\n"  \
-        "  add %0,%0," #x "@tls\n"   \
-        : "=b" (__result) );  \
-      __result;  \
+#  define TLS_IE(x)							      \
+  ({ int * __result;							      \
+     asm ("ld  %0," #x "@got@tprel(2)\n\t"				      \
+	  "add %0,%0," #x "@tls"					      \
+	  : "=r" (__result) );						      \
+     __result;								      \
   })
-# ifdef HAVE_ASM_GLOBAL_DOT_NAME
-#  define __TLS_GET_ADDR ".__tls_get_addr"
-# else
-#  define __TLS_GET_ADDR "__tls_get_addr"
-# endif
+#  ifdef HAVE_ASM_GLOBAL_DOT_NAME
+#   define __TLS_GET_ADDR ".__tls_get_addr"
+#  else
+#   define __TLS_GET_ADDR "__tls_get_addr"
+#  endif
 /* PowerPC64 Local Dynamic TLS access.  */
-#  define TLS_LD(x) \
-  ({  int * __result;  \
-      asm (  \
-        "  addi  3,2," #x "@got@tlsld\n"  \
-        "  bl    " __TLS_GET_ADDR "\n"  \
-        "  nop   \n"  \
-        "  addis %0,3," #x "@dtprel@ha\n"  \
-        "  addi  %0,%0," #x "@dtprel@l\n"  \
-        : "=b" (__result) :  \
-        : "0", "3", "4", "5", "6", "7",    \
-          "8", "9", "10", "11", "12",      \
-          "lr", "ctr",  \
-          "cr0", "cr1", "cr5", "cr6", "cr7");  \
-      __result;  \
+#  define TLS_LD(x)							      \
+  ({ int * __result;							      \
+     asm ("addi  3,2," #x "@got@tlsld\n\t"				      \
+	  "bl    " __TLS_GET_ADDR "\n\t"				      \
+	  "nop   \n\t"							      \
+	  "addis %0,3," #x "@dtprel@ha\n\t"				      \
+	  "addi  %0,%0," #x "@dtprel@l"					      \
+	  : "=b" (__result) :						      \
+	  : "3", __TLS_CALL_CLOBBERS);					      \
+     __result;								      \
   })
 /* PowerPC64 General Dynamic TLS access.  */
-#  define TLS_GD(x) \
-  ({  int * __result;  \
-      asm (  \
-        "  addi  3,2," #x "@got@tlsgd\n"  \
-        "  bl    " __TLS_GET_ADDR "\n"  \
-        "  nop   \n"  \
-        "  mr    %0,3\n"  \
-        : "=b" (__result) :  \
-        : "0", "3", "4", "5", "6", "7",    \
-          "8", "9", "10", "11", "12",      \
-          "lr", "ctr",  \
-          "cr0", "cr1", "cr5", "cr6", "cr7");  \
-      __result;  \
+#  define TLS_GD(x)							      \
+  ({ register int *__result __asm__ ("r3");				      \
+     asm ("addi  3,2," #x "@got@tlsgd\n\t"				      \
+	  "bl    " __TLS_GET_ADDR "\n\t"				      \
+	  "nop   "							      \
+	  : "=r" (__result) :						      \
+	  : __TLS_CALL_CLOBBERS);					      \
+     __result;								      \
   })
+# endif
 
 #elif !defined TLS_LE || !defined TLS_IE \
       || !defined TLS_LD || !defined TLS_GD
diff --git a/hurd/hurd/ioctl.h b/hurd/hurd/ioctl.h
index ee156f02f9..e5ab3dc965 100644
--- a/hurd/hurd/ioctl.h
+++ b/hurd/hurd/ioctl.h
@@ -57,7 +57,7 @@ extern int hurd_register_ioctl_handler (int first_request, int last_request,
   static const struct ioctl_handler handler##_ioctl_handler##moniker	      \
   	__attribute__ ((__unused__)) =					      \
     { _IOC_NOTYPE (first), _IOC_NOTYPE (last),				      \
-	(int (*) (int, int, void *)) (handler), NULL };	      		      \
+	(ioctl_handler_t) (handler), NULL };				      \
   text_set_element (_hurd_ioctl_handler_lists,				      \
                     handler##_ioctl_handler##moniker)
 #define	_HURD_HANDLE_IOCTLS(handler, first, last)			      \
diff --git a/hurd/hurdioctl.c b/hurd/hurdioctl.c
index 7c689841ca..04d98629ef 100644
--- a/hurd/hurdioctl.c
+++ b/hurd/hurdioctl.c
@@ -1,5 +1,5 @@
 /* ioctl commands which must be done in the C library.
-   Copyright (C) 1994,95,96,97,99,2001,2002,2009
+   Copyright (C) 1994,95,96,97,99,2001,2002,2009,2010
 	Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -239,34 +239,40 @@ _hurd_setcttyid (mach_port_t cttyid)
 }
 
 
-/* Make FD be the controlling terminal.
-   This function is called for `ioctl (fd, TCIOSCTTY)'.  */
-
-static int
-tiocsctty (int fd,
-	   int request)		/* Always TIOCSCTTY.  */
+static inline error_t
+do_tiocsctty (io_t port, io_t ctty)
 {
   mach_port_t cttyid;
   error_t err;
 
-  /* Get FD's cttyid port, unless it is already ours.  */
-  err = HURD_DPORT_USE (fd, ctty != MACH_PORT_NULL ? EADDRINUSE :
-			__term_getctty (port, &cttyid));
-  if (err == EADDRINUSE)
-    /* FD is already the ctty.  Nothing to do.  */
+  if (ctty != MACH_PORT_NULL)
+    /* PORT is already the ctty.  Nothing to do.  */
     return 0;
-  else if (err)
-    return __hurd_fail (err);
+
+  /* Get PORT's cttyid port.  */
+  err = __term_getctty (port, &cttyid);
+  if (err)
+    return err;
 
   /* Change the terminal's pgrp to ours.  */
-  err = HURD_DPORT_USE (fd, __tioctl_tiocspgrp (port, _hurd_pgrp));
+  err = __tioctl_tiocspgrp (port, _hurd_pgrp);
   if (err)
-    return __hurd_fail (err);
+    __mach_port_deallocate (__mach_task_self (), cttyid);
+  else
+    /* Make it our own.  */
+    install_ctty (cttyid);
 
-  /* Make it our own.  */
-  install_ctty (cttyid);
+  return err;
+}
 
-  return 0;
+/* Make FD be the controlling terminal.
+   This function is called for `ioctl (fd, TIOCSCTTY)'.  */
+
+static int
+tiocsctty (int fd,
+	   int request)		/* Always TIOCSCTTY.  */
+{
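+  /* HURD_DPORT_USE evaluates the expression with `port' bound to FD's
+     I/O server port and `ctty' to its ctty port.  */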
+  return __hurd_fail (HURD_DPORT_USE (fd, do_tiocsctty (port, ctty)));
 }
 _HURD_HANDLE_IOCTL (tiocsctty, TIOCSCTTY);
 
diff --git a/include/fenv.h b/include/fenv.h
index 3aec7e52bb..254162d45c 100644
--- a/include/fenv.h
+++ b/include/fenv.h
@@ -13,6 +13,7 @@ extern int __fesetenv (__const fenv_t *__envp);
 extern int __feupdateenv (__const fenv_t *__envp);
 
 libm_hidden_proto (feraiseexcept)
+libm_hidden_proto (fegetenv)
 libm_hidden_proto (fesetenv)
 libm_hidden_proto (fesetround)
 libm_hidden_proto (feholdexcept)
diff --git a/io/ftw.c b/io/ftw.c
index 9cc09077ed..bb7dba8ca8 100644
--- a/io/ftw.c
+++ b/io/ftw.c
@@ -1,5 +1,5 @@
 /* File tree walker functions.
-   Copyright (C) 1996-2004, 2006, 2007, 2008 Free Software Foundation, Inc.
+   Copyright (C) 1996-2004, 2006-2008, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -790,6 +790,7 @@ ftw_startup (const char *dir, int is_nftw, void *func, int descriptors,
     {
       int save_err = errno;
       __fchdir (cwdfd);
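+      /* The descriptor for the initial directory is no longer needed.  */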
+      close_not_cancel_no_status (cwdfd);
       __set_errno (save_err);
     }
   else if (cwd != NULL)
diff --git a/math/fegetenv.c b/math/fegetenv.c
index 4a878cc41b..5b524307db 100644
--- a/math/fegetenv.c
+++ b/math/fegetenv.c
@@ -32,6 +32,7 @@ __fegetenv (fenv_t *envp)
 strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
 
 stub_warning (fegetenv)
diff --git a/sysdeps/i386/fpu/fegetenv.c b/sysdeps/i386/fpu/fegetenv.c
index fb955cf565..ddb67e5d83 100644
--- a/sysdeps/i386/fpu/fegetenv.c
+++ b/sysdeps/i386/fpu/fegetenv.c
@@ -40,4 +40,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index fbad9ae734..e8847d6fc4 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -7,7 +7,9 @@ ifeq ($(subdir),string)
 sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
 		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
-		   memset-sse2-rep bzero-sse2-rep
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..b1ed778f1f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1004 @@
+/* memcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_sse4_2
+#endif
+
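+/* Keep the CFA and register-save unwind information in sync with each
+   push and pop of a callee-saved register.  */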
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		4
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
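+/* RETURN pops %ebx and returns; the trailing CFI_PUSH re-establishes
+   the unwind info for the following code, which is only reached while
+   %ebx is still saved on the stack.  */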
+#define RETURN		POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#ifdef SHARED
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table and adjusted EAX/EDX.  Go.  */	\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
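+	/* Return the caller's PC (the address following the call) in %ebx.  */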
+	movl	(%esp), %ebx
+	ret
+#else
+# define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	pxor	%xmm0, %xmm0
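+	/* %xmm0 is kept zero; "ptest %xmmN, %xmm0" sets CF only when %xmmN
+	   is all zeros, so jnc branches when a 16-byte chunk differs.  */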
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+	PUSH (%ebx)
+	jb	L(less8bytes)
+	add	%ecx, %edx
+	add	%ecx, %eax
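+	/* Both pointers now point just past the end of the blocks, so the
+	   length-indexed targets in the jump table compare using negative
+	   offsets from the end.  */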
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+L(nonzero):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(0bytes):
+	POP (%ebx)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(64bytesormore):
+	PUSH (%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(16bytes)
+
+	ALIGN (4)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	ALIGN (2)
+	.type	L(table_64bytes), @object
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.size	L(table_64bytes), .-L(table_64bytes)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..d2f852f726
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1966 @@
+/* memcmp with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_ssse3
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		4
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
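+/* RETURN_END pops the saved registers and returns; RETURN additionally
+   restores the remembered CFI state for the code paths that follow.  */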
+#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	PUSH (%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
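+	/* Point both pointers just past the end of the blocks before
+	   jumping to the short-length code.  */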
+	jmp	L(less48bytes)
+
+	ALIGN (4)
+	CFI_POP (%ebx)
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+
+	ALIGN (4)
+L(zero):
+	mov	$0, %eax
+	ret
+
+	ALIGN (4)
+L(48bytesormore):
+	PUSH (%ebx)
+	PUSH (%esi)
+	PUSH (%edi)
+	cfi_remember_state
+	movdqu    (%eax), %xmm3
+	movdqu    (%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb   %xmm0, %xmm3
+	pmovmskb  %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub      $0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	  L(less16bytes)
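+	/* Align the first block down to 16 bytes and adjust the second
+	   pointer and the length to match; the remaining misalignment of
+	   the second block selects the palignr shift used below.  */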
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN (4)
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+
+	ALIGN (4)
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_0_gobble):
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_1):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_1_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_2):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_2_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_3):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_3_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_4):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_4_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_5):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_5_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_6):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_6_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_7):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_7_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_8):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_8_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_9):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_9_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_10):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_10_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_11):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_11_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_12):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_12_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
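+/* A mismatch was detected in the 32 bytes just compared.  If the mask for
+   the first 16-byte block (kept in %xmm1) is all ones, the difference lies
+   in the second block; otherwise back up 16 bytes and use the first block's
+   mask.  The code below then locates the exact differing byte.  */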
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte16):
+	movzbl	 -16(%edi), %eax
+	movzbl	 -16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte17):
+	movzbl	 -15(%edi), %eax
+	movzbl	 -15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte18):
+	movzbl	 -14(%edi), %eax
+	movzbl	 -14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte19):
+	movzbl	 -13(%edi), %eax
+	movzbl	 -13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte20):
+	movzbl	 -12(%edi), %eax
+	movzbl	 -12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte21):
+	movzbl	 -11(%edi), %eax
+	movzbl	 -11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte22):
+	movzbl	 -10(%edi), %eax
+	movzbl	 -10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	ALIGN (4)
+L(Byte31):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+
+	CFI_PUSH (%ebx)
+	ALIGN (4)
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+
+	ALIGN (4)
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+
+	ALIGN (4)
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+
+	ALIGN (4)
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+
+	ALIGN (4)
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
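+/* Tail handling: fewer than 48 bytes remain.  %eax and %edx point just past
+   the bytes still to be compared and %ecx holds their count, so the dispatch
+   below jumps to the L(Nbytes) entry matching that count, which compares the
+   data one 32-bit word at a time using negative displacements.  */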
+	ALIGN (4)
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+
+	ALIGN (4)
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
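+/* %ecx and %ebx hold a pair of 32-bit words that differ.  Compare their
+   bytes from least to most significant to find the first difference, then
+   L(end) turns that unsigned comparison into the +1/-1 return value.  */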
+	ALIGN (4)
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+
+END (MEMCMP)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..cf606a5959
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,88 @@
+/* Multiple versions of memcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
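+/* The IFUNC resolver below returns __memcmp_sse4_2 on processors with
+   SSE4.2, __memcmp_ssse3 when only SSSE3 is available, and the generic
+   __memcmp_ia32 otherwise.  */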
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_sse4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memcmp)
+# else
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__memcmp_ssse3, %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__memcmp_sse4_2, %eax
+2:	ret
+END(memcmp)
+# endif
+
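+/* The i686 memcmp.S included at the end of this file is assembled under the
+   internal name __memcmp_ia32 by redefining ENTRY and END, so it does not
+   clash with the IFUNC entry point defined above.  */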
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..d5fd23e15c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,378 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp)
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	test	%ebp, %ebp
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+	jne	L(less4bytes)
+	movdqu	(%eax), %xmm1
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
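+/* Reached either because one of the strings is too close to a page boundary
+   for a safe 16-byte load, or after the first 16 bytes matched: compare the
+   next 8 bytes one at a time before setting up the main loop.  */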
+L(first4bytes):
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+	cfi_remember_state
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
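+/* The main loop uses unaligned 16-byte loads, so before each run compute in
+   %edx how many bytes can be read before either string gets within 16 bytes
+   of the end of its page; once a page boundary is that close, the byte-wise
+   L(crosspage) code takes over so the loads never touch an unmapped page.  */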
+L(check_offset):
+	movl	%edi, %ebx
+	movl	%esi, %ecx
+	andl	$0xfff, %ebx
+	andl	$0xfff, %ecx
+	cmpl	%ebx, %ecx
+	cmovl	%ebx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
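+/* Compare 16 bytes of both strings per iteration; pcmpistri also checks for
+   the terminating NUL, and `jbe' (CF or ZF set) leaves the loop as soon as a
+   difference or the end of a string is found, with %ecx indexing the
+   interesting byte.  */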
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ebx
+	subl	%ebx, %eax
+	jne	L(ret)
+	testl	%ebx, %ebx
+	je	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	$1, %ebp
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	$16, %edi
+	add	$16, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):
+	jnc	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	%ecx, %ebp
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ebx
+	movzbl	(%edi,%ebx), %eax
+	movzbl	(%esi,%ebx), %ecx
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+	.p2align 4
+	cfi_restore_state
+#ifdef USE_AS_STRNCMP
+L(more16byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+L(eq):
+	xorl	%eax, %eax
+	RETURN
+
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+	RETURN
+
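+/* A difference or a NUL byte lies within the first 16 bytes, but the first
+   4 bytes (still in %ecx) matched.  Use the magic-constant carry trick to
+   check that word for a terminating NUL, then narrow the search down 4 bytes
+   and finally 1 byte at a time.  */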
+L(less16bytes):
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+	xor	(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+	xor	4(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	jmp	L(eq)
+
+END (STRCMP)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..40994c05b1
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2220 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp)
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, %ebp;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, %ebp
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	cmp	$16, %ebp
+	jb	L(less16bytes_sncmp)
+	jmp	L(more16bytes)
+#endif
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	je	L(eq)
+L(more16bytes):
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+#ifdef USE_AS_STRNCMP
+	cfi_remember_state
+#endif
+
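+/* Align %eax and %edx down to 16 bytes; %ecx and %edi keep the original
+   offsets within those blocks.  If needed the operands are exchanged (the
+   swap is recorded in bit 0x20 of %ebx, the low bits record the shift), so
+   only one ordering has to be handled, and the difference of the two offsets
+   selects one of the L(ashr_N) cases below.  */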
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+	xor	%ebx, %ebx
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	or	$0x20, %ebx
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx), %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	mov	$0x10, %ebx
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$15, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$1, %ebx
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
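+/* The next aligned 16-byte load from the shifted string would cross into a
+   new page.  Check the not-yet-compared bytes of the block already loaded
+   (in %xmm3) for a terminating NUL; only if none is found is it safe to keep
+   reading.  The other L(nibble_ashr_N) blocks work the same way.  */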
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$15, %ebp
+	jbe	L(ashr_1_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$2, %ebx
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$14, %ebp
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$3, %ebx
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$13, %ebp
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$4, %ebx
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$12, %ebp
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$5, %ebx
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$11, %ebp
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$6, %ebx
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$10, %ebp
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-9(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$7, %ebx
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi
+	jnz	L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$9, %ebp
+	jbe	L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of eax)   edi(offset of edx)   relative offset   corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$8, %ebx
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi
+	jnz	L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	jbe	L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$9, %ebx
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$10, %ebx
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$11, %ebx
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$12, %ebx
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$13, %ebx
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$14, %ebx
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$15, %ebx
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
+L(exit):
+	mov	%ebx, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	add	%edi, %edx
+	add	%ecx, %eax
+	test	$0x20, %ebx
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+L(less16bytes):
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(eq)
+#endif
+
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+#ifdef USE_AS_STRNCMP
+	cmp	$0, %ebp
+	jbe	L(eq)
+#endif
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(eq)
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(eq)
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(eq)
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(eq)
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(eq)
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	jbe	L(eq)
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+#ifdef USE_AS_STRNCMP
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+
+L(eq):
+
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	xorl	%eax, %eax
+	ret
+
+#ifdef USE_AS_STRNCMP
+	.p2align 4
+	CFI_PUSH (%ebp)
+L(less16bytes_sncmp):
+	test	%ebp, %ebp
+	jz	L(eq)
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$1, %ebp
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$2, %ebp
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$3, %ebp
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$4, %ebp
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$5, %ebp
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$6, %ebp
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$7, %ebp
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$8, %ebp
+	je	L(eq)
+
+	movzbl	8(%eax), %ecx
+	cmpb	%cl, 8(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$9, %ebp
+	je	L(eq)
+
+	movzbl	9(%eax), %ecx
+	cmpb	%cl, 9(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$10, %ebp
+	je	L(eq)
+
+	movzbl	10(%eax), %ecx
+	cmpb	%cl, 10(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$11, %ebp
+	je	L(eq)
+
+	movzbl	11(%eax), %ecx
+	cmpb	%cl, 11(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$12, %ebp
+	je	L(eq)
+
+	movzbl	12(%eax), %ecx
+	cmpb	%cl, 12(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$13, %ebp
+	je	L(eq)
+
+	movzbl	13(%eax), %ecx
+	cmpb	%cl, 13(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$14, %ebp
+	je	L(eq)
+
+	movzbl	14(%eax), %ecx
+	cmpb	%cl, 14(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$15, %ebp
+	je	L(eq)
+
+	movzbl	15(%eax), %ecx
+	cmpb	%cl, 15(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	POP	(%ebp)
+	xor	%eax, %eax
+	ret
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..7136d47e85
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,115 @@
+/* Multiple versions of strcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCMP
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#else
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library, since we
+   need strncmp before the initialization has happened.  */
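+/* In rough C terms the IFUNC resolver below picks the implementation like
+   this (a sketch only; HAS_SSSE3/HAS_SSE4_2 stand for the feature tests
+   done against __cpu_features):
+
+	if (cpu features not yet initialized)
+	  __init_cpu_features ();
+	ret = __STRCMP_IA32;
+	if (HAS_SSSE3)
+	  {
+	    ret = __STRCMP_SSSE3;
+	    if (HAS_SSE4_2)
+	      ret = __STRCMP_SSE4_2;
+	  }
+	return ret;
+ */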
+#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(STRCMP)
+# else
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSSE3, %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSE4_2, %eax
+2:	ret
+END(STRCMP)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which is needed for
+   the PLT that IFUNC uses.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#ifndef USE_AS_STRNCMP
+# include "../strcmp.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncmp-c.c b/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncmp.S b/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..b6814315fb
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/sysdeps/i386/lshift.S b/sysdeps/i386/lshift.S
index 536d9878eb..398cf038c7 100644
--- a/sysdeps/i386/lshift.S
+++ b/sysdeps/i386/lshift.S
@@ -1,5 +1,5 @@
 /* i80386 __mpn_lshift --
-   Copyright (C) 1992, 1994, 1997-2000, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -55,6 +55,7 @@ ENTRY (BP_SYM (__mpn_lshift))
 
 	movl	(%esi,%edx,4),%ebx	/* read most significant limb */
 	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
 	xorl	%eax,%eax
 	shldl	%cl,%ebx,%eax		/* compute carry limb */
 	decl	%edx
@@ -95,6 +96,7 @@ L(1):	movl	(%esi,%edx,4),%eax
 	LEAVE
 	ret
 
+	cfi_restore_state
 L(end):	shll	%cl,%ebx		/* compute least significant limb */
 	movl	%ebx,(%edi)		/* store it */
 
diff --git a/sysdeps/i386/rshift.S b/sysdeps/i386/rshift.S
index 3fd0afe822..332c4d09e7 100644
--- a/sysdeps/i386/rshift.S
+++ b/sysdeps/i386/rshift.S
@@ -1,5 +1,5 @@
 /* i80386 __mpn_rshift --
-   Copyright (C) 1992,1994,1997-2000,2005 Free Software Foundation, Inc.
+   Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -57,6 +57,7 @@ ENTRY (BP_SYM (__mpn_rshift))
 
 	movl	(%esi,%edx,4),%ebx	/* read least significant limb */
 	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
 	xorl	%eax,%eax
 	shrdl	%cl,%ebx,%eax		/* compute carry limb */
 	incl	%edx
@@ -97,10 +98,7 @@ L(1):	movl	(%esi,%edx,4),%eax
 	LEAVE
 	ret
 
-	cfi_adjust_cfa_offset (12)
-	cfi_rel_offset (edi, 8)
-	cfi_rel_offset (esi, 4)
-	cfi_rel_offset (ebx, 0)
+	cfi_restore_state
 L(end):	shrl	%cl,%ebx		/* compute most significant limb */
 	movl	%ebx,(%edi)		/* store it */
 
diff --git a/sysdeps/ia64/fpu/fegetenv.c b/sysdeps/ia64/fpu/fegetenv.c
index 5446b16494..e240f75e43 100644
--- a/sysdeps/ia64/fpu/fegetenv.c
+++ b/sysdeps/ia64/fpu/fegetenv.c
@@ -27,3 +27,4 @@ fegetenv (fenv_t *envp)
 
   return 0;
 }
+libm_hidden_def (fegetenv)
diff --git a/sysdeps/powerpc/fpu/fegetenv.c b/sysdeps/powerpc/fpu/fegetenv.c
index 53953454cb..3d21abb529 100644
--- a/sysdeps/powerpc/fpu/fegetenv.c
+++ b/sysdeps/powerpc/fpu/fegetenv.c
@@ -35,4 +35,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
diff --git a/sysdeps/powerpc/powerpc32/configure b/sysdeps/powerpc/powerpc32/configure
index 9b76c57886..da8ec0b87c 100644
--- a/sysdeps/powerpc/powerpc32/configure
+++ b/sysdeps/powerpc/powerpc32/configure
@@ -25,11 +25,10 @@ rm -f conftest*
 fi
 { $as_echo "$as_me:$LINENO: result: $libc_cv_ppc_rel16" >&5
 $as_echo "$libc_cv_ppc_rel16" >&6; }
-if test $libc_cv_ppc_rel16 = yes; then
-  cat >>confdefs.h <<\_ACEOF
-#define HAVE_ASM_PPC_REL16 1
-_ACEOF
-
+if test $libc_cv_ppc_rel16 = no; then
+  { { $as_echo "$as_me:$LINENO: error: R_PPC_REL16 is not supported. Binutils is too old." >&5
+$as_echo "$as_me: error: R_PPC_REL16 is not supported. Binutils is too old." >&2;}
+   { (exit 1); exit 1; }; }
 fi
 
 # See whether GCC uses -msecure-plt.
diff --git a/sysdeps/powerpc/powerpc32/configure.in b/sysdeps/powerpc/powerpc32/configure.in
index 7219ad993e..21d3f5ee5b 100644
--- a/sysdeps/powerpc/powerpc32/configure.in
+++ b/sysdeps/powerpc/powerpc32/configure.in
@@ -13,8 +13,8 @@ else
   libc_cv_ppc_rel16=no
 fi
 rm -f conftest*])
-if test $libc_cv_ppc_rel16 = yes; then
-  AC_DEFINE(HAVE_ASM_PPC_REL16)
+if test $libc_cv_ppc_rel16 = no; then
+  AC_MSG_ERROR(R_PPC_REL16 is not supported. Binutils is too old.)
 fi
 
 # See whether GCC uses -msecure-plt.
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.h b/sysdeps/powerpc/powerpc32/dl-machine.h
index 6f8d0f506e..5351d9691d 100644
--- a/sysdeps/powerpc/powerpc32/dl-machine.h
+++ b/sysdeps/powerpc/powerpc32/dl-machine.h
@@ -41,16 +41,13 @@ static inline Elf32_Addr * __attribute__ ((const))
 ppc_got (void)
 {
   Elf32_Addr *got;
-#ifdef HAVE_ASM_PPC_REL16
+
   asm ("bcl 20,31,1f\n"
        "1:	mflr %0\n"
        "	addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n"
        "	addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n"
        : "=b" (got) : : "lr");
-#else
-  asm (" bl _GLOBAL_OFFSET_TABLE_-4@local"
-       : "=l" (got));
-#endif
+
   return got;
 }
 
diff --git a/sysdeps/powerpc/powerpc32/dl-start.S b/sysdeps/powerpc/powerpc32/dl-start.S
index c77c4de198..ae41f47ede 100644
--- a/sysdeps/powerpc/powerpc32/dl-start.S
+++ b/sysdeps/powerpc/powerpc32/dl-start.S
@@ -47,15 +47,10 @@ _dl_start_user:
    passed by value!).  */
 
 /*  Put our GOT pointer in r31, */
-#ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r31
 	addis	r31,r31,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r31,r31,_GLOBAL_OFFSET_TABLE_-1b@l
-#else
-	bl	_GLOBAL_OFFSET_TABLE_-4@local
-	mflr	r31
-#endif
 /*  the address of _start in r30, */
 	mr	r30,r3
 /*  &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28.  */
diff --git a/sysdeps/powerpc/powerpc32/elf/start.S b/sysdeps/powerpc/powerpc32/elf/start.S
index a8abdca0c6..dc89a5e109 100644
--- a/sysdeps/powerpc/powerpc32/elf/start.S
+++ b/sysdeps/powerpc/powerpc32/elf/start.S
@@ -53,10 +53,6 @@ L(start_addresses):
 	ASM_SIZE_DIRECTIVE(L(start_addresses))
 
 	.section ".text"
-#if defined PIC && !defined HAVE_ASM_PPC_REL16
-L(start_addressesp):
-	.long	L(start_addresses)-L(branch)
-#endif
 ENTRY(_start)
  /* Save the stack pointer, in case we're statically linked under Linux.  */
 	mr	r9,r1
@@ -77,16 +73,10 @@ L(branch):
     start_addresses in r8.  Also load the GOT pointer so that new PLT
     calls work, like the one to __libc_start_main.  */
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 	addis	r30,r13,_GLOBAL_OFFSET_TABLE_-L(branch)@ha
 	addis	r8,r13,L(start_addresses)-L(branch)@ha
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-L(branch)@l
 	lwzu	r13,L(start_addresses)-L(branch)@l(r8)
-# else
-	lwz	r8,L(start_addressesp)-L(branch)(r13)
-	add	r8,r13,r8
-	lwz	r13,0(r8)
-# endif
 #else
 	lis	r8,L(start_addresses)@ha
 	lwzu	r13,L(start_addresses)@l(r8)
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 04ed6da68b..e1ac064a59 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -34,15 +34,10 @@ ENTRY (BP_SYM (__longjmp))
 # ifdef PIC
 	mflr    r6
 	cfi_register (lr,r6)
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-#  endif
 #  ifdef SHARED
 	lwz     r5,_rtld_global_ro@got(r5)
 	mtlr    r6
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
index bc74d302fb..80e72ca2bd 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
@@ -31,17 +31,10 @@ ENTRY (__ceil)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
index 47a75ec0c3..ce6d71e4f8 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
@@ -30,17 +30,10 @@ ENTRY (__ceilf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floor.S b/sysdeps/powerpc/powerpc32/fpu/s_floor.S
index a29e4791ea..0dd0dbe6c0 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_floor.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_floor.S
@@ -31,17 +31,10 @@ ENTRY (__floor)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
index 99fbdc5f86..98a47458bc 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
@@ -30,17 +30,10 @@ ENTRY (__floorf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index d73749e134..3bf1ffaea1 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -45,17 +45,10 @@ ENTRY (__lround)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp10,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp10,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rint.S b/sysdeps/powerpc/powerpc32/fpu/s_rint.S
index c8dca313ae..93133718ad 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_rint.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_rint.S
@@ -33,17 +33,10 @@ ENTRY (__rint)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
index 7771cb2bc8..1e0fbb1f0d 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
@@ -29,17 +29,10 @@ ENTRY (__rintf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_round.S b/sysdeps/powerpc/powerpc32/fpu/s_round.S
index 590c87ad8c..48b346e651 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_round.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_round.S
@@ -43,16 +43,10 @@ ENTRY (__round)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfs	fp13,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 7e99bca315..88125aad06 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -42,16 +42,10 @@ ENTRY (__roundf )
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfs	fp13,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
index 5bc0856b9f..c3c021716a 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
@@ -38,17 +38,10 @@ ENTRY (__trunc)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
index e2e3bd6740..eddef070cd 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
@@ -37,17 +37,10 @@ ENTRY (__truncf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index b7d1abc00d..131e7a332e 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -85,15 +85,10 @@ ENTRY (BP_SYM (__sigsetjmp))
 # ifdef PIC
 	mflr    r6
 	cfi_register(lr,r6)
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-#  endif
 	mtlr	r6
 	cfi_same_value (lr)
 #  ifdef SHARED
diff --git a/sysdeps/powerpc/powerpc32/memset.S b/sysdeps/powerpc/powerpc32/memset.S
index 454abb2b65..b4ce218e24 100644
--- a/sysdeps/powerpc/powerpc32/memset.S
+++ b/sysdeps/powerpc/powerpc32/memset.S
@@ -256,17 +256,10 @@ L(checklinesize):
 	beq	L(medium)
 /* Establishes GOT addressability so we can load __cache_line_size
    from static. This value was set from the aux vector during startup.  */
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	rGOT
 	addis	rGOT,rGOT,__cache_line_size-1b@ha
 	lwz	rCLS,__cache_line_size-1b@l(rGOT)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	rGOT
-	lwz	rGOT,__cache_line_size@got(rGOT)
-	lwz	rCLS,0(rGOT)
-# endif
 	mtlr	rTMP
 #else
 /* Load __cache_line_size from static. This value was set from the
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index e10a37977a..b03e041d8a 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -53,16 +53,10 @@ ENTRY (__llround)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfd	fp9,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
index 95a0b3915d..8be3cf1848 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
@@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
index c31555194b..9fa282c162 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
@@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
index 105b5912a1..27a1a0dcbb 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
@@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
index 14bc0a2ceb..8914855542 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
@@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power7/Implies b/sysdeps/powerpc/powerpc32/power7/Implies
deleted file mode 100644
index 03899d8a3c..0000000000
--- a/sysdeps/powerpc/powerpc32/power7/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power5
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/powerpc/powerpc32/power7/fpu/Implies
deleted file mode 100644
index 819a7d7979..0000000000
--- a/sysdeps/powerpc/powerpc32/power7/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..5b0d950c74
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -0,0 +1,89 @@
+/* finite().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+ENTRY (__finite)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,1
+	bflr	30
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
+
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz     r0,8(r1)      /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	clrlwi	r0,r0,17      /* r0 = abs(r0).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r0,0x7ff0 /* r0 == 0x7ff0?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	li	r3,0
+	blr
+	END (__finite)
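+/* In C terms the fallback path above is roughly (a sketch; hi16 is the
+   halfword holding the sign and exponent bits, as fetched by the lhz):
+
+	return (hi16 & 0x7fff) < 0x7ff0;
+
+   i.e. the value is finite iff its biased exponent is below the maximum.  */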
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__finite, __finitel)
+weak_alias (__finite, finitel)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..2979534911
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -0,0 +1,88 @@
+/* isinf().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+ENTRY (__isinf)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* Either we have -INF/+INF or a denormal.  */
+
+	stwu    r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,8(r1)      /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..852539f24b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -0,0 +1,92 @@
+/* isnan().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+ENTRY (__isnan)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN or Inf, finish. */
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd	fp1,8(r1)     /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lwz     r4,8(r1)      /* Load the upper half of the FP value.  */
+	lwz     r5,12(r1)     /* Load the lower half of the FP value.  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	lis     r0,0x7ff0     /* Load the upper portion for an INF/NaN.  */
+	clrlwi  r4,r4,1	      /* r4 = abs(r4).  */
+	cmpw    cr7,r4,r0     /* if (abs(r4) <= inf).  */
+	cmpwi   cr6,r5,0      /* r5 == 0x00000000?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	bgt	cr7,L(NaN)    /* GT means we have a NaN.  */
+	beqlr	cr6	      /* EQ means we have +/-INF.  */
+L(NaN):
+	li      r3,1	      /* x is a NaN.  */
+	blr
+	END (__isnan)
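+/* In C terms the slow path above is roughly (a sketch; hi and lo are the
+   two words of the double as stored by the stfd):
+
+	hi &= 0x7fffffff;
+	return hi > 0x7ff00000 || (hi == 0x7ff00000 && lo != 0);
+
+   i.e. maximum biased exponent with a non-zero mantissa.  */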
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies
deleted file mode 100644
index 13b03309fb..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
deleted file mode 100644
index 13b03309fb..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..6763d1adc8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,68 @@
+/* finite().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+EALIGN (__finite, 4, 0)
+	CALL_MCOUNT 0
+	lfd     fp0,.LC0@toc(r2)
+	ftdiv   cr7,fp1,fp0
+	li	r3,1
+	bflr    30
+
+	/* If we are here, we either have +/-INF,
+	NaN or denormal.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+
+	lhz     r4,-16(r1)    /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	clrlwi  r4,r4,17      /* r4 = abs(r4).  */
+	cmpwi   cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	bltlr   cr7	      /* LT means finite, otherwise non-finite.  */
+	li      r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..f896d38026
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -0,0 +1,71 @@
+/* isinf().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+EALIGN (__isinf, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* Either we have -INF/+INF or a denormal.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,-16(r1)    /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means +INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
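
Here the slow path only has to determine the sign, since ftdiv has already flagged the
operand: the upper halfword (sign plus exponent plus top mantissa bits) equals 0x7ff0
exactly for +INF, so EQ yields 1 and anything else on this path yields -1.  A standalone
C model that checks the full bit patterns instead (illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Return 1 for +INF, -1 for -INF, 0 otherwise.  */
int
isinf_bits (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if (bits == UINT64_C (0x7ff0000000000000))
    return 1;     /* +INF */
  if (bits == UINT64_C (0xfff0000000000000))
    return -1;    /* -INF */
  return 0;
}
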
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..8877012598
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -0,0 +1,69 @@
+/* isnan().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+EALIGN (__isnan, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN, finish.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	ld	r4,-16(r1)    /* Load FP into GPR.  */
+	lis     r0,0x7ff0
+	sldi	r0,r0,32      /* r0 = 0x7ff0000000000000 (+INF bit pattern).  */
+	clrldi	r4,r4,1	      /* x = fabs(x)  */
+	cmpd	cr7,r4,r0     /* if (fabs(x) <= inf)  */
+	blelr	cr7	      /* LE means not NaN.  */
+	li	r3,1	      /* else return 1  */
+	blr
+	END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
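
The NaN test in the slow path is a single unsigned comparison: with the sign bit
cleared, every NaN encoding is strictly greater than the +INF bit pattern.  A standalone
C model (illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Clear the sign bit and compare against +INF; anything above it is a NaN.  */
int
isnan_bits (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= UINT64_C (0x7fffffffffffffff);        /* bits = fabs(x) */
  return bits > UINT64_C (0x7ff0000000000000);  /* above +INF => NaN */
}
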
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/sysdeps/s390/fpu/fegetenv.c b/sysdeps/s390/fpu/fegetenv.c
index a244f2ca8b..04da54c94c 100644
--- a/sysdeps/s390/fpu/fegetenv.c
+++ b/sysdeps/s390/fpu/fegetenv.c
@@ -20,10 +20,6 @@
 
 #include <fenv_libc.h>
 #include <fpu_control.h>
-#include <stddef.h>
-#include <asm/ptrace.h>
-#include <sys/ptrace.h>
-#include <unistd.h>
 
 int
 fegetenv (fenv_t *envp)
@@ -33,3 +29,4 @@ fegetenv (fenv_t *envp)
   /* Success.  */
   return 0;
 }
+libm_hidden_def (fegetenv)
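
The libm_hidden_def additions in this patch (here and for sh, sparc and x86_64 below)
pair with libm_hidden_proto declarations so that calls to fegetenv from inside libm
bind to a hidden alias rather than going through the PLT.  A rough model of the effect
using plain GCC visibility attributes, since the real macros only exist inside the
glibc build (the names my_fegetenv and my_fegetenv_hidden are illustrative):

#include <fenv.h>

int my_fegetenv (fenv_t *envp) { return fegetenv (envp); }

/* A hidden alias: references from within the same shared object bind to it
   directly, with no PLT entry and no possibility of interposition.  */
extern int my_fegetenv_hidden (fenv_t *)
  __attribute__ ((visibility ("hidden"), alias ("my_fegetenv")));
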
diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c
index 868dea68ca..14daf2118f 100644
--- a/sysdeps/s390/s390-64/utf16-utf32-z9.c
+++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c
@@ -203,7 +203,10 @@ gconv_end (struct __gconv_step *data)
    swapping).  */
 #define BODY								\
   {									\
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
+    /* The hardware instruction currently fails to report an error for	\
+       isolated low surrogates so we have to disable the instruction	\
+       until this gets resolved.  */					\
+    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
       {									\
 	HARDWARE_CONVERT ("cu24 %0, %1, 1");				\
 	if (inptr != inend)						\
@@ -229,6 +232,12 @@ gconv_end (struct __gconv_step *data)
       }									\
     else								\
       {									\
+        /* An isolated low-surrogate was found.  This has to be         \
+	   considered ill-formed.  */					\
+        if (__builtin_expect (u1 >= 0xdc00, 0))				\
+	  {								\
+	    STANDARD_FROM_LOOP_ERR_HANDLER (2);				\
+	  }								\
 	/* It's a surrogate character.  At least the first word says	\
 	   it is.  */							\
 	if (__builtin_expect (inptr + 4 > inend, 0))			\
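
The new early check mirrors the UTF-16 well-formedness rule: a low surrogate
(0xDC00-0xDFFF) may only appear as the second unit of a pair, so seeing one first is an
error, just like a high surrogate that is not followed by a low one.  A self-contained
C sketch of the decoding rules the converter applies (illustrative, not part of the
patch):

#include <stddef.h>
#include <stdint.h>

/* Decode one UTF-16 sequence; return the code point or UINT32_MAX if the
   input is ill-formed.  *consumed reports how many 16-bit units were used.  */
uint32_t
decode_utf16 (const uint16_t *in, size_t avail, size_t *consumed)
{
  uint16_t u1 = in[0];
  *consumed = 0;

  if (u1 < 0xd800 || u1 > 0xdfff)   /* ordinary BMP character */
    {
      *consumed = 1;
      return u1;
    }
  if (u1 >= 0xdc00)                 /* isolated low surrogate: ill-formed */
    return UINT32_MAX;
  if (avail < 2)                    /* truncated pair */
    return UINT32_MAX;
  uint16_t u2 = in[1];
  if (u2 < 0xdc00 || u2 > 0xdfff)   /* high surrogate without a low one */
    return UINT32_MAX;
  *consumed = 2;
  return 0x10000 + (((uint32_t) (u1 - 0xd800) << 10) | (u2 - 0xdc00));
}
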
diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c
index 531d3ebd4b..5f73f3c59e 100644
--- a/sysdeps/s390/s390-64/utf8-utf16-z9.c
+++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c
@@ -345,9 +345,12 @@ gconv_end (struct __gconv_step *data)
    Operation.  */
 #define BODY								\
   {									\
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
+    /* The hardware instruction currently fails to report an error for	\
+       isolated low surrogates so we have to disable the instruction	\
+       until this gets resolved.  */					\
+    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
       {									\
-	HARDWARE_CONVERT ("cu21 %0, %1");				\
+	HARDWARE_CONVERT ("cu21 %0, %1, 1");				\
 	if (inptr != inend)						\
 	  {								\
 	    /* Check if the third byte is				\
@@ -388,7 +391,7 @@ gconv_end (struct __gconv_step *data)
 									\
 	outptr += 2;							\
       }									\
-    else if (c >= 0x0800 && c <= 0xd7ff)				\
+    else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)		\
       {									\
 	/* Three byte UTF-8 char.  */					\
 									\
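
The widened condition sends every BMP code point outside the surrogate range
(0x0800-0xD7FF as before, and now also 0xE000-0xFFFF) down the three-byte UTF-8 path;
previously units above 0xDFFF matched none of the branches.  A small classification
sketch (illustrative, not part of the patch):

#include <stdint.h>

/* UTF-8 length for a UTF-16 code unit c; 0 means c is a surrogate and
   must be handled as part of a pair.  */
int
utf8_len_for_unit (uint16_t c)
{
  if (c < 0x80)
    return 1;                                     /* ASCII */
  if (c < 0x800)
    return 2;                                     /* two-byte sequence */
  if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)
    return 3;                                     /* three-byte sequence */
  return 0;                                       /* surrogate code unit */
}
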
diff --git a/sysdeps/sh/sh4/fpu/fegetenv.c b/sysdeps/sh/sh4/fpu/fegetenv.c
index c07b32af30..683939b52d 100644
--- a/sysdeps/sh/sh4/fpu/fegetenv.c
+++ b/sysdeps/sh/sh4/fpu/fegetenv.c
@@ -29,3 +29,4 @@ fegetenv (fenv_t *envp)
 
   return 0;
 }
+libm_hidden_def (fegetenv)
diff --git a/sysdeps/sparc/fpu/fegetenv.c b/sysdeps/sparc/fpu/fegetenv.c
index 36486f5973..c606a9cac0 100644
--- a/sysdeps/sparc/fpu/fegetenv.c
+++ b/sysdeps/sparc/fpu/fegetenv.c
@@ -34,4 +34,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/sysdeps/sparc/sparc32/dl-irel.h b/sysdeps/sparc/sparc32/dl-irel.h
new file mode 100644
index 0000000000..1891938d6d
--- /dev/null
+++ b/sysdeps/sparc/sparc32/dl-irel.h
@@ -0,0 +1,55 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   SPARC 32-bit version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+#include <dl-plt.h>
+
+#define ELF_MACHINE_IRELA	1
+
+static inline void
+__attribute ((always_inline))
+elf_irela (const Elf32_Rela *reloc)
+{
+  unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1))
+    {
+      Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) ();
+      *reloc_addr = value;
+    }
+  else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1))
+    {
+      Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) ();
+
+      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
+    }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
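
elf_irela is what a static binary runs at startup for IRELATIVE-style relocations: the
relocation addend is the address of an STT_GNU_IFUNC resolver, the resolver is called,
and its return value becomes the final contents of the relocated slot (stored directly
for R_SPARC_IRELATIVE, via the PLT patcher for R_SPARC_JMP_IREL).  A self-contained C
model of that control flow, with an illustrative resolver in place of a real hwcap
check (not part of the patch):

#include <stdio.h>

/* Two candidate implementations an IFUNC resolver can choose between.  */
static int impl_a (void) { return 1; }
static int impl_b (void) { return 2; }

static int use_b;   /* stand-in for a hardware-capability test */

/* The resolver runs once and returns the chosen implementation.  */
static int (*resolve (void)) (void)
{
  return use_b ? impl_b : impl_a;
}

int
main (void)
{
  /* What elf_irela does for R_SPARC_IRELATIVE: call the resolver whose
     address is stored as the addend, then write the returned address
     into the relocated slot.  */
  int (*slot) (void) = resolve ();
  printf ("%d\n", slot ());   /* later calls go straight to the chosen impl */
  return 0;
}
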
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index b3b7852d87..e1385f7aca 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -1,5 +1,5 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  SPARC version.
-   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007
+   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -27,20 +27,13 @@
 #include <sys/param.h>
 #include <ldsodefs.h>
 #include <tls.h>
+#include <dl-plt.h>
 
 #ifndef VALIDX
 # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \
 		      + DT_EXTRANUM + DT_VALTAGIDX (tag))
 #endif
 
-/* Some SPARC opcodes we need to use for self-modifying code.  */
-#define OPCODE_NOP	0x01000000 /* nop */
-#define OPCODE_CALL	0x40000000 /* call ?; add PC-rel word address */
-#define OPCODE_SETHI_G1	0x03000000 /* sethi ?, %g1; add value>>10 */
-#define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
-#define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
-#define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
-
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int
 elf_machine_matches_host (const Elf32_Ehdr *ehdr)
@@ -312,41 +305,6 @@ _dl_start_user:\n\
 	.size   _dl_start_user, . - _dl_start_user\n\
 	.previous");
 
-static inline __attribute__ ((always_inline)) Elf32_Addr
-sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
-		 Elf32_Addr value, int t, int do_flush)
-{
-  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
-
-  if (0 && disp >= -0x800000 && disp < 0x800000)
-    {
-      /* Don't need to worry about thread safety. We're writing just one
-	 instruction.  */
-
-      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
-      if (do_flush)
-	__asm __volatile ("flush %0" : : "r"(reloc_addr));
-    }
-  else
-    {
-      /* For thread safety, write the instructions from the bottom and
-	 flush before we overwrite the critical "b,a".  This of course
-	 need not be done during bootstrapping, since there are no threads.
-	 But we also can't tell if we _can_ use flush, so don't. */
-
-      reloc_addr += t;
-      reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
-      if (do_flush)
-	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
-
-      reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10);
-      if (do_flush)
-	__asm __volatile ("flush %0" : : "r"(reloc_addr));
-    }
-
-  return value;
-}
-
 static inline Elf32_Addr
 elf_machine_fixup_plt (struct link_map *map, lookup_t t,
 		       const Elf32_Rela *reloc,
@@ -433,6 +391,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 
   value += reloc->r_addend;	/* Assume copy relocs have zero addend.  */
 
+  if (sym != NULL
+      && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+      && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+    {
+      value = ((Elf32_Addr (*) (void)) value) ();
+    }
+
   switch (r_type)
     {
 #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP
@@ -460,6 +425,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
     case R_SPARC_32:
       *reloc_addr = value;
       break;
+    case R_SPARC_IRELATIVE:
+      value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+      break;
+    case R_SPARC_JMP_IREL:
+      value = ((Elf32_Addr (*) (void)) value) ();
+      /* Fall thru */
     case R_SPARC_JMP_SLOT:
       {
 #if !defined RTLD_BOOTSTRAP && !defined __sparc_v9__
@@ -578,16 +550,21 @@ __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map,
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc)
 {
-  switch (ELF32_R_TYPE (reloc->r_info))
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1))
+    ;
+  else if (r_type == R_SPARC_JMP_IREL)
     {
-    case R_SPARC_NONE:
-      break;
-    case R_SPARC_JMP_SLOT:
-      break;
-    default:
-      _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1);
-      break;
+      Elf32_Addr value = map->l_addr + reloc->r_addend;
+      value = ((Elf32_Addr (*) (void)) value) ();
+      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
     }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
 }
 
 #endif	/* RESOLVE_MAP */
diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h
new file mode 100644
index 0000000000..edcc5c1374
--- /dev/null
+++ b/sysdeps/sparc/sparc32/dl-plt.h
@@ -0,0 +1,62 @@
+/* PLT fixups.  Sparc 32-bit version.
+   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010
+   Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* Some SPARC opcodes we need to use for self-modifying code.  */
+#define OPCODE_NOP	0x01000000 /* nop */
+#define OPCODE_CALL	0x40000000 /* call ?; add PC-rel word address */
+#define OPCODE_SETHI_G1	0x03000000 /* sethi ?, %g1; add value>>10 */
+#define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
+#define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
+#define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
+
+static inline __attribute__ ((always_inline)) Elf32_Addr
+sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
+		 Elf32_Addr value, int t, int do_flush)
+{
+  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
+
+  if (0 && disp >= -0x800000 && disp < 0x800000)
+    {
+      /* Don't need to worry about thread safety. We're writing just one
+	 instruction.  */
+
+      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+      if (do_flush)
+	__asm __volatile ("flush %0" : : "r"(reloc_addr));
+    }
+  else
+    {
+      /* For thread safety, write the instructions from the bottom and
+	 flush before we overwrite the critical "b,a".  This of course
+	 need not be done during bootstrapping, since there are no threads.
+	 But we also can't tell if we _can_ use flush, so don't. */
+
+      reloc_addr += t;
+      reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
+      if (do_flush)
+	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
+
+      reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10);
+      if (do_flush)
+	__asm __volatile ("flush %0" : : "r"(reloc_addr));
+    }
+
+  return value;
+}
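
The sethi/jmp pair written by sparc_fixup_plt splits the 32-bit target address across
the two instruction immediates: the upper 22 bits go into the sethi and the low 10 bits
into the jmp, and the words are stored bottom-up with flushes so a concurrent caller
never executes a half-patched entry.  A small round-trip check of that encoding, using
the opcode constants defined above (illustrative, not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define OPCODE_SETHI_G1 0x03000000u   /* sethi %hi(value), %g1 */
#define OPCODE_JMP_G1   0x81c06000u   /* jmp   %g1 + %lo(value), %g0 */

int
main (void)
{
  uint32_t value = 0x12345678u;                          /* illustrative target */
  uint32_t insn_hi = OPCODE_SETHI_G1 | (value >> 10);    /* upper 22 bits */
  uint32_t insn_lo = OPCODE_JMP_G1 | (value & 0x3ff);    /* lower 10 bits */

  /* Reassemble the target from the two immediates.  */
  uint32_t rebuilt = ((insn_hi & 0x3fffff) << 10) | (insn_lo & 0x3ff);
  printf ("%s\n", rebuilt == value ? "round-trips" : "mismatch");
  return 0;
}
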
diff --git a/sysdeps/sparc/sparc64/dl-irel.h b/sysdeps/sparc/sparc64/dl-irel.h
new file mode 100644
index 0000000000..1a2a0a3dd5
--- /dev/null
+++ b/sysdeps/sparc/sparc64/dl-irel.h
@@ -0,0 +1,58 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   SPARC 64-bit version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+#include <dl-plt.h>
+
+#define ELF_MACHINE_IRELA	1
+
+static inline void
+__attribute ((always_inline))
+elf_irela (const Elf64_Rela *reloc)
+{
+  unsigned int r_type = (reloc->r_info & 0xff);
+
+  if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1))
+    {
+      Elf64_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) ();
+      *reloc_addr = value;
+    }
+  else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1))
+    {
+      Elf64_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) ();
+      struct link_map map = { .l_addr = 0 };
+
+      /* 'high' is always zero, for large PLT entries the linker
+	 emits an R_SPARC_IRELATIVE.  */
+      sparc64_fixup_plt (&map, reloc, reloc_addr, value, 0, 0);
+    }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 3eee672912..b4f43e9cf5 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -1,6 +1,6 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  Sparc64 version.
-   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
-	Free Software Foundation, Inc.
+   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+	2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -27,6 +27,7 @@
 #include <sys/param.h>
 #include <ldsodefs.h>
 #include <sysdep.h>
+#include <dl-plt.h>
 
 #ifndef VALIDX
 # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \
@@ -89,132 +90,6 @@ elf_machine_load_address (void)
   return (Elf64_Addr) got - *got + (Elf32_Sword) ((pc[2] - pc[3]) * 4) - 4;
 }
 
-/* We have 4 cases to handle.  And we code different code sequences
-   for each one.  I love V9 code models...  */
-static inline void __attribute__ ((always_inline))
-sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
-		   Elf64_Addr *reloc_addr, Elf64_Addr value,
-		   Elf64_Addr high, int t)
-{
-  unsigned int *insns = (unsigned int *) reloc_addr;
-  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
-  Elf64_Sxword disp = value - plt_vaddr;
-
-  /* Now move plt_vaddr up to the call instruction.  */
-  plt_vaddr += ((t + 1) * 4);
-
-  /* PLT entries .PLT32768 and above look always the same.  */
-  if (__builtin_expect (high, 0) != 0)
-    {
-      *reloc_addr = value - map->l_addr;
-    }
-  /* Near destination.  */
-  else if (disp >= -0x800000 && disp < 0x800000)
-    {
-      /* As this is just one instruction, it is thread safe and so
-	 we can avoid the unnecessary sethi FOO, %g1.
-	 b,a target  */
-      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* 32-bit Sparc style, the target is in the lower 32-bits of
-     address space.  */
-  else if (insns += t, (value >> 32) == 0)
-    {
-      /* sethi	%hi(target), %g1
-	 jmpl	%g1 + %lo(target), %g0  */
-
-      insns[1] = 0x81c06000 | (value & 0x3ff);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* We can also get somewhat simple sequences if the distance between
-     the target and the PLT entry is within +/- 2GB.  */
-  else if ((plt_vaddr > value
-	    && ((plt_vaddr - value) >> 31) == 0)
-	   || (value > plt_vaddr
-	       && ((value - plt_vaddr) >> 31) == 0))
-    {
-      unsigned int displacement;
-
-      if (plt_vaddr > value)
-	displacement = (0 - (plt_vaddr - value));
-      else
-	displacement = value - plt_vaddr;
-
-      /* mov	%o7, %g1
-	 call	displacement
-	  mov	%g1, %o7  */
-
-      insns[2] = 0x9e100001;
-      __asm __volatile ("flush %0 + 8" : : "r" (insns));
-
-      insns[1] = 0x40000000 | (displacement >> 2);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x8210000f;
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* Worst case, ho hum...  */
-  else
-    {
-      unsigned int high32 = (value >> 32);
-      unsigned int low32 = (unsigned int) value;
-
-      /* ??? Some tricks can be stolen from the sparc64 egcs backend
-	     constant formation code I wrote.  -DaveM  */
-
-      if (__builtin_expect (high32 & 0x3ff, 0))
-	{
-	  /* sethi	%hh(value), %g1
-	     sethi	%lm(value), %g5
-	     or		%g1, %hm(value), %g1
-	     or		%g5, %lo(value), %g5
-	     sllx	%g1, 32, %g1
-	     jmpl	%g1 + %g5, %g0
-	      nop  */
-
-	  insns[5] = 0x81c04005;
-	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
-
-	  insns[4] = 0x83287020;
-	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
-
-	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
-	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
-
-	  insns[2] = 0x82106000 | (high32 & 0x3ff);
-	}
-      else
-	{
-	  /* sethi	%hh(value), %g1
-	     sethi	%lm(value), %g5
-	     sllx	%g1, 32, %g1
-	     or		%g5, %lo(value), %g5
-	     jmpl	%g1 + %g5, %g0
-	      nop  */
-
-	  insns[4] = 0x81c04005;
-	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
-
-	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
-	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
-
-	  insns[2] = 0x83287020;
-	}
-
-      __asm __volatile ("flush %0 + 8" : : "r" (insns));
-
-      insns[1] = 0x0b000000 | (low32 >> 10);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x03000000 | (high32 >> 10);
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-}
-
 static inline Elf64_Addr __attribute__ ((always_inline))
 elf_machine_fixup_plt (struct link_map *map, lookup_t t,
 		       const Elf64_Rela *reloc,
@@ -549,6 +424,11 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 
   value += reloc->r_addend;	/* Assume copy relocs have zero addend.  */
 
+  if (sym != NULL
+      && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+      && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+    value = ((Elf64_Addr (*) (void)) value) ();
+
   switch (r_type)
     {
 #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP
@@ -576,6 +456,13 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
     case R_SPARC_GLOB_DAT:
       *reloc_addr = value;
       break;
+    case R_SPARC_IRELATIVE:
+      value = ((Elf64_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+      break;
+    case R_SPARC_JMP_IREL:
+      value = ((Elf64_Addr (*) (void)) value) ();
+      /* Fall thru */
     case R_SPARC_JMP_SLOT:
 #ifdef RESOLVE_CONFLICT_FIND_MAP
       /* R_SPARC_JMP_SLOT conflicts against .plt[32768+]
@@ -757,16 +644,29 @@ __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map,
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc)
 {
-  switch (ELF64_R_TYPE (reloc->r_info))
+  Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1))
+    ;
+  else if (r_type == R_SPARC_JMP_IREL
+	   || r_type == R_SPARC_IRELATIVE)
     {
-    case R_SPARC_NONE:
-      break;
-    case R_SPARC_JMP_SLOT:
-      break;
-    default:
-      _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1);
-      break;
+      Elf64_Addr value = map->l_addr + reloc->r_addend;
+      value = ((Elf64_Addr (*) (void)) value) ();
+      if (r_type == R_SPARC_JMP_IREL)
+	{
+	  /* 'high' is always zero, for large PLT entries the linker
+	     emits an R_SPARC_IRELATIVE.  */
+	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
+	}
+      else
+	*reloc_addr = value;
     }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
 }
 
 #endif	/* RESOLVE_MAP */
diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h
new file mode 100644
index 0000000000..e06be43a0a
--- /dev/null
+++ b/sysdeps/sparc/sparc64/dl-plt.h
@@ -0,0 +1,144 @@
+/* PLT fixups.  Sparc 64-bit version.
+   Copyright (C) 1997-2006, 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* We have 4 cases to handle.  And we code different code sequences
+   for each one.  I love V9 code models...  */
+static inline void __attribute__ ((always_inline))
+sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
+		   Elf64_Addr *reloc_addr, Elf64_Addr value,
+		   Elf64_Addr high, int t)
+{
+  unsigned int *insns = (unsigned int *) reloc_addr;
+  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
+  Elf64_Sxword disp = value - plt_vaddr;
+
+  /* Now move plt_vaddr up to the call instruction.  */
+  plt_vaddr += ((t + 1) * 4);
+
+  /* PLT entries .PLT32768 and above look always the same.  */
+  if (__builtin_expect (high, 0) != 0)
+    {
+      *reloc_addr = value - map->l_addr;
+    }
+  /* Near destination.  */
+  else if (disp >= -0x800000 && disp < 0x800000)
+    {
+      /* As this is just one instruction, it is thread safe and so
+	 we can avoid the unnecessary sethi FOO, %g1.
+	 b,a target  */
+      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* 32-bit Sparc style, the target is in the lower 32-bits of
+     address space.  */
+  else if (insns += t, (value >> 32) == 0)
+    {
+      /* sethi	%hi(target), %g1
+	 jmpl	%g1 + %lo(target), %g0  */
+
+      insns[1] = 0x81c06000 | (value & 0x3ff);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* We can also get somewhat simple sequences if the distance between
+     the target and the PLT entry is within +/- 2GB.  */
+  else if ((plt_vaddr > value
+	    && ((plt_vaddr - value) >> 31) == 0)
+	   || (value > plt_vaddr
+	       && ((value - plt_vaddr) >> 31) == 0))
+    {
+      unsigned int displacement;
+
+      if (plt_vaddr > value)
+	displacement = (0 - (plt_vaddr - value));
+      else
+	displacement = value - plt_vaddr;
+
+      /* mov	%o7, %g1
+	 call	displacement
+	  mov	%g1, %o7  */
+
+      insns[2] = 0x9e100001;
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x40000000 | (displacement >> 2);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x8210000f;
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* Worst case, ho hum...  */
+  else
+    {
+      unsigned int high32 = (value >> 32);
+      unsigned int low32 = (unsigned int) value;
+
+      /* ??? Some tricks can be stolen from the sparc64 egcs backend
+	     constant formation code I wrote.  -DaveM  */
+
+      if (__builtin_expect (high32 & 0x3ff, 0))
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     or		%g1, %hm(value), %g1
+	     or		%g5, %lo(value), %g5
+	     sllx	%g1, 32, %g1
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[5] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
+
+	  insns[4] = 0x83287020;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x82106000 | (high32 & 0x3ff);
+	}
+      else
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     sllx	%g1, 32, %g1
+	     or		%g5, %lo(value), %g5
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[4] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x83287020;
+	}
+
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x0b000000 | (low32 >> 10);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | (high32 >> 10);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+}
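
sparc64_fixup_plt picks one of five PLT rewrites depending on where the resolved target
lies relative to the PLT entry.  A compact C sketch of just that case analysis,
mirroring the conditions above (illustrative, not part of the patch):

#include <stdint.h>

enum plt_model
{
  PLT_DATA_SLOT,    /* .PLT32768 and above: store an offset, no code patching */
  PLT_BRANCH,       /* single "b,a target" */
  PLT_SETHI_JMPL,   /* target in the low 32 bits of the address space */
  PLT_CALL_REL32,   /* mov/call/mov, target within +/- 2GB of the entry */
  PLT_FULL_64BIT    /* full six-instruction 64-bit constant sequence */
};

enum plt_model
choose_plt_model (uint64_t plt_vaddr, uint64_t value, uint64_t high, int t)
{
  int64_t disp = (int64_t) (value - plt_vaddr);

  plt_vaddr += (t + 1) * 4;               /* move up to the call instruction */
  if (high != 0)
    return PLT_DATA_SLOT;
  if (disp >= -0x800000 && disp < 0x800000)
    return PLT_BRANCH;
  if ((value >> 32) == 0)
    return PLT_SETHI_JMPL;
  if ((plt_vaddr > value && ((plt_vaddr - value) >> 31) == 0)
      || (value > plt_vaddr && ((value - plt_vaddr) >> 31) == 0))
    return PLT_CALL_REL32;
  return PLT_FULL_64BIT;
}
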
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
index 4cb968505d..cfd9864f63 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
@@ -28,19 +28,12 @@
 #define __longjmp ____longjmp_chk
 
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 #  define LOAD_ARG \
 	bcl	20,31,1f;				\
 1:	mflr	r3;					\
 	addis	r3,r3,_GLOBAL_OFFSET_TABLE_-1b@ha;	\
 	addi	r3,r3,_GLOBAL_OFFSET_TABLE_-1b@l;	\
 	lwz	r3,.LC0@got(r3)
-# else
-#  define LOAD_ARG \
-	bl	_GLOBAL_OFFSET_TABLE_-4@local;		\
-	mflr	r3;					\
-	lwz	r3,.LC0@got(r3)
-# endif
 #else
 # define LOAD_ARG \
 	lis	r3,.LC0@ha;				\
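
With the non-REL16 fallback removed here and in the following powerpc32 files, the
remaining PIC sequence computes the GOT address PC-relatively: bcl/mflr capture the
address of label 1, and the @ha/@l halves of _GLOBAL_OFFSET_TABLE_-1b are added on
top.  A C model of the @ha/@l arithmetic, assuming arithmetic right shift of signed
values (illustrative, not part of the patch):

#include <stdint.h>

/* Recombine the high-adjusted and sign-extended low halves of an offset
   and add it to the captured PC to reproduce the GOT address.  */
uint32_t
got_address (uint32_t pc_at_label_1, int32_t got_minus_label)
{
  int32_t ha = (got_minus_label + 0x8000) >> 16;  /* @ha: adjusted for the
                                                     sign of the low half */
  int32_t lo = (int16_t) got_minus_label;         /* @l: low 16 bits, signed */
  return pc_at_label_1 + ((uint32_t) ha << 16) + (uint32_t) lo;
}
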
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
index e945834945..4c8c6b433b 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
@@ -36,17 +36,10 @@ ENTRY (BP_SYM (__brk))
 	DO_CALL(SYS_ify(brk))
 	lwz     r6,8(r1)
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,__curbrk-1b@ha
 	stw	r3,__curbrk-1b@l(r5)
-# else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-	lwz     r5,__curbrk@got(r5)
-	stw     r3,0(r5)
-# endif
 #else
 	lis     r4,__curbrk@ha
 	stw     r3,__curbrk@l(r4)
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
index 63e1773e22..27285ed4a5 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
@@ -145,15 +145,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 # ifdef __CONTEXT_ENABLE_VRS
 #  ifdef PIC
 	mflr    r8
-#   ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-#   else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-#   endif
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
index d379a2dd12..af946119aa 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
@@ -1,3 +1,4 @@
 # Make sure this comes before the powerpc/powerpc32/fpu that's
 # listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
+powerpc/powerpc32/power7/fpu
 powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
index 127c9e4581..f304090868 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
@@ -73,15 +73,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 
 #ifdef PIC
 	mflr    r8
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-# else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-# endif
 # ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
index 89b1a61954..62efee2dce 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
@@ -146,15 +146,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 
 # ifdef PIC
 	mflr    r8
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-#  endif
 #  ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
index c46b3d42af..ca112208d1 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
@@ -1,3 +1,4 @@
 # Make sure this comes before the powerpc/powerpc64/fpu that's
 # listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
+powerpc/powerpc64/power7/fpu
 powerpc/powerpc64/power5/fpu
diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c
index fa5a8dadcb..2159a1fab1 100644
--- a/sysdeps/x86_64/fpu/fegetenv.c
+++ b/sysdeps/x86_64/fpu/fegetenv.c
@@ -28,3 +28,4 @@ fegetenv (fenv_t *envp)
   /* Success.  */
   return 0;
 }
+libm_hidden_def (fegetenv)