From 6cbbaa50aac809ad6e0692247876c82d58e466bf Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Thu, 2 Jul 2009 03:30:55 -0700
Subject: [PATCH 01/50] Fix possible race when freeing object in fast bin list.

---
 ChangeLog       | 5 +++++
 malloc/malloc.c | 1 +
 2 files changed, 6 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 0c6c36f472..4700e7d7b3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2009-07-02  Ulrich Drepper  <drepper@redhat.com>
+
+	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when
+	adding to fast bin list.
+
 2009-07-01  Ulrich Drepper  <drepper@redhat.com>
 
 	* nis/nss_nis/nis-network.c (_nss_nis_getnetbyaddr_r): Don't use

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 516d401991..70e4e58845 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4822,6 +4822,7 @@ _int_free(mstate av, mchunkptr p)
 	  goto errout;
 	}
       p->fd = fd = old;
+      atomic_full_barrier ();
     }
   while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd);
 #else

From ab6a873fe07b8ded403bc5a5ca73be5d04820d61 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hongjiu.lu@intel.com>
Date: Thu, 2 Jul 2009 03:39:03 -0700
Subject: [PATCH 02/50] SSSE3 strcpy/stpcpy for x86-64

This patch adds SSSE3 strcpy/stpcpy. I got up to a 4X speed-up on Core 2
and Core i7. I disabled it on Atom since the SSSE3 version is slower for
short (<64 byte) data.
---
 ChangeLog                            |   19 +
 string/stpncpy.c                     |   15 +-
 string/strncpy.c                     |    9 +-
 sysdeps/x86_64/multiarch/Makefile    |    2 +-
 sysdeps/x86_64/multiarch/stpcpy.S    |    7 +
 sysdeps/x86_64/multiarch/stpncpy-c.c |    8 +
 sysdeps/x86_64/multiarch/stpncpy.S   |    6 +
 sysdeps/x86_64/multiarch/strcpy.S    | 1917 ++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strncpy-c.c |    8 +
 sysdeps/x86_64/multiarch/strncpy.S   |    3 +
 10 files changed, 1982 insertions(+), 12 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/stpcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/stpncpy-c.c
 create mode 100644 sysdeps/x86_64/multiarch/stpncpy.S
 create mode 100644 sysdeps/x86_64/multiarch/strcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-c.c
 create mode 100644 sysdeps/x86_64/multiarch/strncpy.S

diff --git a/ChangeLog b/ChangeLog
index 4700e7d7b3..b3c403dc16 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2009-06-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* string/stpncpy.c (STPNCPY): New. Defined if not defined.
+	(__stpncpy): Renamed to ...
+	(STPNCPY): This.
+	(stpncpy): Create alias only if STPNCPY is not defined.
+	* string/strncpy.c (STRNCPY): New. Defined to strncpy if not
+	defined.
+	(strncpy): Renamed to ...
+	(STRNCPY): This.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	stpncpy-c strncpy-c for string.
+	* sysdeps/x86_64/multiarch/stpcpy.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/stpncpy.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/strncpy.S: New file.
+
 2009-07-02  Ulrich Drepper  <drepper@redhat.com>
 
 	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when

diff --git a/string/stpncpy.c b/string/stpncpy.c
index 164d0f1747..2ebab33d8a 100644
--- a/string/stpncpy.c
+++ b/string/stpncpy.c
@@ -28,17 +28,19 @@
 # include <sys/types.h>
 #endif
 
-#ifndef weak_alias
-# define __stpncpy stpncpy
+#ifndef STPNCPY
+# ifdef weak_alias
+#  define STPNCPY __stpncpy
+weak_alias (__stpncpy, stpncpy)
+# else
+#  define STPNCPY stpncpy
+# endif
 #endif
 
 /* Copy no more than N characters of SRC to DEST, returning the
    address of the terminating '\0' in DEST, if any, or else DEST + N.
*/ char * -__stpncpy (dest, src, n) - char *dest; - const char *src; - size_t n; +STPNCPY (char *dest, const char *src, size_t n) { char c; char *s = dest; @@ -96,5 +98,4 @@ __stpncpy (dest, src, n) } #ifdef weak_alias libc_hidden_def (__stpncpy) -weak_alias (__stpncpy, stpncpy) #endif diff --git a/string/strncpy.c b/string/strncpy.c index f32612e1cf..2274d7d31e 100644 --- a/string/strncpy.c +++ b/string/strncpy.c @@ -21,11 +21,12 @@ #undef strncpy +#ifndef STRNCPY +#define STRNCPY strncpy +#endif + char * -strncpy (s1, s2, n) - char *s1; - const char *s2; - size_t n; +STRNCPY (char *s1, const char *s2, size_t n) { reg_char c; char *s = s1; diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb4..127592aa3a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,5 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c strncmp-c endif diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..bbc9979e0c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1917 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
 */

+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3 __stpncpy_ssse3
+#  define STRCPY_SSE2 __stpncpy_sse2
+#  define __GI_STRCPY __GI_stpncpy
+# else
+#  define STRCPY_SSSE3 __stpcpy_ssse3
+#  define STRCPY_SSE2 __stpcpy_sse2
+#  define __GI_STRCPY __GI_stpcpy
+#  define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3 __strncpy_ssse3
+#  define STRCPY_SSE2 __strncpy_sse2
+#  define __GI_STRCPY __GI_strncpy
+# else
+#  define STRCPY_SSSE3 __strcpy_ssse3
+#  define STRCPY_SSE2 __strcpy_sse2
+#  define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	STRCPY_SSE2(%rip), %rax
+	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	3f
+/* Avoid SSSE3 strcpy on Atom since it is slow.  */
+	cmpl	$1, __cpu_features+KIND_OFFSET(%rip)
+	jne	2f
+	cmpl	$6, __cpu_features+FAMILY_OFFSET(%rip)
+	jne	2f
+	cmpl	$28, __cpu_features+MODEL_OFFSET(%rip)
+	jz	3f
+2:	leaq	STRCPY_SSSE3(%rip), %rax
+3:	ret
+END(STRCPY)
+
+	.section .text.ssse3,"ax",@progbits
+STRCPY_SSSE3:
+	cfi_startproc
+	CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to copy up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCPY
+	test	%rdx, %rdx
+	jz	LABEL(strncpy_exitz)
+	mov	%rdx, %r8
+#else
+	xor	%edx, %edx
+#endif
+	mov	%esi, %ecx
+	and	$0xfffffffffffffff0, %rsi	/* force rsi to be 16-byte aligned */
+	and	$15, %ecx
+	mov	%rdi, %rax			/* store return parameter */
+
+
+	pxor	%xmm0, %xmm0		/* clear %xmm0 */
+	pcmpeqb	(%rsi), %xmm0		/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
+	shr	%cl, %edx		/* get real bits left in edx */
+	test	%edx, %edx		/* edx must be 0 if there is no null char from rsi+%rcx */
+	jnz	LABEL(less16bytes)
+
+#ifdef USE_AS_STRNCPY
+	lea	-16(%r8,%rcx), %r11
+	cmp	$0, %r11
+	jle	LABEL(less16bytes)	/* if r8 + rcx <= 16, branch to less16bytes.  */
+#endif
+
+	mov	%rcx, %r9
+	or	%edi, %ecx
+	and	$15, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)	/* ecx must be 0 if offset of rsi and rdi is 16-byte aligned */
+
+	neg	%r10		/* store the rest in rsi aligned 16 bytes for unaligned_exit */
+
+	pxor	%xmm0, %xmm0	/* clear %xmm0, may be polluted by unaligned operation */
+	pcmpeqb	16(%rsi), %xmm0	/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)
+	/*
+	 * at least 16 bytes available to fill destination rdi
+	 */
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	 * so far destination rdi may be aligned by 16, re-calculate rsi to jump to the
+	 * corresponding case
+	 * rcx is offset of rsi
+	 * rax is offset of rdi
+	 */
+
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16-byte aligned */
+	mov	%rax, %rdx			/* rax stores the original rdi */
+	xor	%rdi, %rdx			/* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+	add	%rdx, %r8
+#endif
+
+	add	$16, %rdi	/* next 16 bytes for rdi */
+	sub	%rdx, %r9
+
+	lea	16(%r9, %rsi), %rsi	/* re-calculate rsi by (16 - rdx) + rcx */
+	mov	%esi, %ecx		/* store offset of rsi */
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16-byte aligned */
+
+	and	$15, %ecx	/* ecx must be 0 if rdx is equal to rcx */
+	jz	LABEL(ashr_0)
+
+	lea	-16(%rcx), %r10
+	mov	%rcx, %r9
+	neg	%r10
+	lea	LABEL(unaligned_table)(%rip), %r11
+	movslq	(%r11, %rcx, 4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+	/*
+	 * The following cases will be handled by ashr_0 & ashr_0_start
+	 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+	 *        0                  0                  0             ashr_0
+	 *        n(1~15)            n(1~15)            0             ashr_0_start
+	 *
+	 */
+	.p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi), %xmm1		/* fetch first 16 bytes from rsi */
+	movdqa	%xmm1, (%rdi)		/* store first 16 bytes into rdi */
+	add	$16, %rsi
+	add	$16, %rdi
+	pcmpeqb	(%rsi), %xmm0		/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
+
+	test	%edx, %edx		/* edx must be 0 if there is no null char in rsi */
+	jnz	LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jz	LABEL(ashr_0_loop)
+
+	jmp	LABEL(aligned_exit)
+	.p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+ *        n(15)              n - 15
15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe 
LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) 
+#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + .p2align 4 + +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef 
USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) + + /* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + +/* + * + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz 
LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) + +/* + * + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) + +/* + * + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) + +/* + * + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY 
+ cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) + + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 stores original offset of rsi*/ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled*/ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /*locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /*locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): +#endif + bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) + + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %ecx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx + mov %al, (%rdi, %rcx) + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): +#ifdef USE_AS_STPCPY + cmpb $1, (%rdx) + sbb $-1, %rdx +#endif + mov %rdx, %rax + ret +#endif + .p2align 4 +LABEL(tail_0): + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_1): + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_2): + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_3): + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_4): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_5): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_6): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_7): + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_8): + + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_9): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_10): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_11): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_12): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_13): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_14): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov 
$15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + +LABEL(tail_15): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_16): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_17): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_18): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_19): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_20): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_21): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_22): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_23): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_24): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) 
+#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_25): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_26): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_27): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_28): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_29): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + + .p2align 4 +LABEL(tail_30): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $31, %cl + sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_31): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + cfi_endproc + .size STRCPY_SSSE3, .-STRCPY_SSSE3 + + .p2align 4 + .section .rodata.ssse3,"a",@progbits +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - 
LABEL(tail_table)
+	.int	LABEL(tail_9) - LABEL(tail_table)
+	.int	LABEL(tail_10) - LABEL(tail_table)
+	.int	LABEL(tail_11) - LABEL(tail_table)
+	.int	LABEL(tail_12) - LABEL(tail_table)
+	.int	LABEL(tail_13) - LABEL(tail_table)
+	.int	LABEL(tail_14) - LABEL(tail_table)
+	.int	LABEL(tail_15) - LABEL(tail_table)
+	.int	LABEL(tail_16) - LABEL(tail_table)
+	.int	LABEL(tail_17) - LABEL(tail_table)
+	.int	LABEL(tail_18) - LABEL(tail_table)
+	.int	LABEL(tail_19) - LABEL(tail_table)
+	.int	LABEL(tail_20) - LABEL(tail_table)
+	.int	LABEL(tail_21) - LABEL(tail_table)
+	.int	LABEL(tail_22) - LABEL(tail_table)
+	.int	LABEL(tail_23) - LABEL(tail_table)
+	.int	LABEL(tail_24) - LABEL(tail_table)
+	.int	LABEL(tail_25) - LABEL(tail_table)
+	.int	LABEL(tail_26) - LABEL(tail_table)
+	.int	LABEL(tail_27) - LABEL(tail_table)
+	.int	LABEL(tail_28) - LABEL(tail_table)
+	.int	LABEL(tail_29) - LABEL(tail_table)
+	.int	LABEL(tail_30) - LABEL(tail_table)
+	.int	LABEL(tail_31) - LABEL(tail_table)
+
+	.p2align 4
+LABEL(unaligned_table):
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCPY_SSE2, @function; \
+	STRCPY_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif

diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..296c32cb5d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"

diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
new file mode 100644
index 0000000000..327a4ce447
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy.S
@@ -0,0 +1,3 @@
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"

From af263b81541d1f4a10fc0862d0f3e3b9464534c1 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Thu, 2 Jul 2009 03:43:05 -0700
Subject: [PATCH 03/50] Whitespace fixes in last patch.
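The previous patch wires its two implementations up through a
gnu_indirect_function stub: the resolver in ENTRY(STRCPY) runs once, and the
chosen routine's address ends up in the PLT slot, so steady-state calls pay
no dispatch cost. As a rough C sketch of that dispatch pattern — with
stand-in copy routines and today's GCC CPU builtins in place of the
__cpu_features fields the assembly tests; this is illustrative, not glibc's
actual code:

#include <stdio.h>
#include <string.h>

typedef char *(*copy_fn) (char *, const char *);

/* Stand-ins for __strcpy_sse2 and __strcpy_ssse3, whose real bodies are
   the assembly in strcpy.S.  */
static char *copy_sse2 (char *d, const char *s)  { return strcpy (d, s); }
static char *copy_ssse3 (char *d, const char *s) { return strcpy (d, s); }

/* What the ENTRY(STRCPY) resolver computes: prefer SSSE3, except on Atom
   (Intel family 6, model 28), where SSSE3 loses on short strings.  */
static copy_fn
select_strcpy (void)
{
  if (__builtin_cpu_supports ("ssse3") && !__builtin_cpu_is ("atom"))
    return copy_ssse3;
  return copy_sse2;
}

int
main (void)
{
  char buf[8];
  puts (select_strcpy () (buf, "hello"));
  return 0;
}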
--- sysdeps/x86_64/multiarch/strcpy.S | 62 +++++++++++++++---------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index bbc9979e0c..9920b0ec74 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -178,7 +178,7 @@ STRCPY_SSSE3: LABEL(ashr_0): #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) + jbe LABEL(strncpy_truncation_aligned) #endif movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */ @@ -266,7 +266,7 @@ LABEL(ashr_15_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $15, (%rsi, %rcx), %xmm3 @@ -285,7 +285,7 @@ LABEL(ashr_15_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $15, (%rsi, %rcx), %xmm3 @@ -322,7 +322,7 @@ LABEL(ashr_14_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $14, (%rsi, %rcx), %xmm3 @@ -341,7 +341,7 @@ LABEL(ashr_14_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $14, (%rsi, %rcx), %xmm3 @@ -378,7 +378,7 @@ LABEL(ashr_13_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $13, (%rsi, %rcx), %xmm3 @@ -397,7 +397,7 @@ LABEL(ashr_13_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $13, (%rsi, %rcx), %xmm3 @@ -434,7 +434,7 @@ LABEL(ashr_12_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $12, (%rsi, %rcx), %xmm3 @@ -453,7 +453,7 @@ LABEL(ashr_12_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $12, (%rsi, %rcx), %xmm3 @@ -490,7 +490,7 @@ LABEL(ashr_11_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $11, (%rsi, %rcx), %xmm3 @@ -509,7 +509,7 @@ LABEL(ashr_11_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $11, (%rsi, %rcx), %xmm3 @@ -546,7 +546,7 @@ LABEL(ashr_10_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $10, (%rsi, %rcx), %xmm3 @@ -565,7 +565,7 @@ LABEL(ashr_10_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $10, (%rsi, %rcx), %xmm3 @@ -602,7 +602,7 @@ LABEL(ashr_9_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe 
LABEL(strncpy_truncation_unaligned) #endif palignr $9, (%rsi, %rcx), %xmm3 @@ -621,7 +621,7 @@ LABEL(ashr_9_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $9, (%rsi, %rcx), %xmm3 @@ -658,7 +658,7 @@ LABEL(ashr_8_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $8, (%rsi, %rcx), %xmm3 @@ -677,7 +677,7 @@ LABEL(ashr_8_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $8, (%rsi, %rcx), %xmm3 @@ -714,7 +714,7 @@ LABEL(ashr_7_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $7, (%rsi, %rcx), %xmm3 @@ -733,7 +733,7 @@ LABEL(ashr_7_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $7, (%rsi, %rcx), %xmm3 @@ -770,7 +770,7 @@ LABEL(ashr_6_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $6, (%rsi, %rcx), %xmm3 @@ -789,7 +789,7 @@ LABEL(ashr_6_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $6, (%rsi, %rcx), %xmm3 @@ -826,7 +826,7 @@ LABEL(ashr_5_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $5, (%rsi, %rcx), %xmm3 @@ -845,7 +845,7 @@ LABEL(ashr_5_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $5, (%rsi, %rcx), %xmm3 @@ -883,7 +883,7 @@ LABEL(ashr_4_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $4, (%rsi, %rcx), %xmm3 @@ -902,7 +902,7 @@ LABEL(ashr_4_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $4, (%rsi, %rcx), %xmm3 @@ -940,7 +940,7 @@ LABEL(ashr_3_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $3, (%rsi, %rcx), %xmm3 @@ -959,7 +959,7 @@ LABEL(ashr_3_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $3, (%rsi, %rcx), %xmm3 @@ -997,7 +997,7 @@ LABEL(ashr_2_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $2, (%rsi, %rcx), %xmm3 @@ -1016,7 +1016,7 @@ LABEL(ashr_2_use_ssse3): jnz LABEL(unaligned_exit) #ifdef USE_AS_STRNCPY sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) + jbe LABEL(strncpy_truncation_unaligned) #endif palignr $2, (%rsi, %rcx), %xmm3 @@ -1054,7 +1054,7 @@ 
LABEL(ashr_1_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr	$1, (%rsi, %rcx), %xmm3
@@ -1072,7 +1072,7 @@ LABEL(ashr_1_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 	palignr	$1, (%rsi, %rcx), %xmm3
 	movdqa	%xmm3, (%rdi, %rcx)

From 167d5ed5de0c6e587506b8a595fe0e4aa630bbb0 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hongjiu.lu@intel.com>
Date: Thu, 2 Jul 2009 04:33:12 -0700
Subject: [PATCH 04/50] Fix handling of xmm6 in ld.so audit hooks on x86-64.

---
 ChangeLog                      |  13 +++
 elf/Makefile                   |  11 ++-
 elf/tst-audit3.c               |  20 +++++
 elf/tst-auditmod3a.c           |  24 +++++
 elf/tst-auditmod3b.c           | 156 +++++++++++++++++++++++++++++++++
 sysdeps/x86_64/dl-trampoline.S |   6 +-
 6 files changed, 227 insertions(+), 3 deletions(-)
 create mode 100644 elf/tst-audit3.c
 create mode 100644 elf/tst-auditmod3a.c
 create mode 100644 elf/tst-auditmod3b.c

diff --git a/ChangeLog b/ChangeLog
index b3c403dc16..795d07ec48 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2009-06-30  H.J. Lu  <hongjiu.lu@intel.com>
 
+	* elf/Makefile (distribute): Remove tst-audit.sh. Add
+	tst-audit2.c, tst-audit3.c, tst-auditmod3a.c, tst-auditmod3b.c.
+	(tests): Add tst-audit3 for x86_64.
+	(modules-names): Add tst-auditmod3a, tst-auditmod3b.
+	($(objpfx)tst-audit3): Define.
+	($(objpfx)tst-audit3.out): Define.
+	(tst-audit3-ENV): Define.
+	* elf/tst-audit3.c: New file.
+	* elf/tst-auditmod3a.c: New file.
+	* elf/tst-auditmod3b.c: New file.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Save
+	and restore xmm6.
+
 	* string/stpncpy.c (STPNCPY): New. Defined if not defined.
 	(__stpncpy): Renamed to ...
 	(STPNCPY): This.
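Background for the test files added below: the x86-64 psABI passes the first
eight vector arguments in registers xmm0-xmm7, so _dl_runtime_profile has to
spill and reload all eight around the user's pltenter/pltexit hooks; per the
ChangeLog, xmm6 was the one the old code failed to restore. A minimal
illustration of that register assignment — not part of the patch; compile
with gcc -O2 and read the generated assembly to watch each parameter arrive
in its own xmm register:

#include <emmintrin.h>

/* Under the x86-64 SysV calling convention, a arrives in %xmm0, b in
   %xmm1, ..., g in %xmm6 and h in %xmm7.  A PLT trampoline that runs
   audit hooks without preserving %xmm6 therefore hands the callee a
   corrupted g -- the corruption tst-audit3 below is built to detect.  */
__m128i
takes_eight_vectors (__m128i a, __m128i b, __m128i c, __m128i d,
                     __m128i e, __m128i f, __m128i g, __m128i h)
{
  __m128i t0 = _mm_xor_si128 (_mm_xor_si128 (a, b), _mm_xor_si128 (c, d));
  __m128i t1 = _mm_xor_si128 (_mm_xor_si128 (e, f), _mm_xor_si128 (g, h));
  return _mm_xor_si128 (t0, t1);	/* result returned in %xmm0 */
}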
diff --git a/elf/Makefile b/elf/Makefile
index 56935d5a1a..57febea483 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -89,7 +89,8 @@ distribute := rtld-Rules \
 	     unload4mod1.c unload4mod2.c unload4mod3.c unload4mod4.c \
 	     unload6mod1.c unload6mod2.c unload6mod3.c \
 	     unload7mod1.c unload7mod2.c \
-	     tst-auditmod1.c tst-audit.sh \
+	     tst-audit1.c tst-audit2.c tst-audit3.c \
+	     tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \
 	     order2mod1.c order2mod2.c order2mod3.c order2mod4.c \
 	     tst-stackguard1.c tst-stackguard1-static.c \
 	     tst-array5.c tst-array5-static.c tst-array5dep.c \
@@ -193,6 +194,9 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \
 #	 reldep9
 test-srcs = tst-pathopt
 tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog
+ifeq (x86_64,$(config-machine))
+tests += tst-audit3
+endif
 endif
 ifeq (yesyes,$(have-fpie)$(build-shared))
 tests: $(objpfx)tst-pie1.out
@@ -230,6 +234,7 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \
 		$(modules-execstack-$(have-z-execstack)) \
 		tst-dlopenrpathmod tst-deep1mod1 tst-deep1mod2 tst-deep1mod3 \
 		tst-dlmopen1mod tst-auditmod1 \
+		tst-auditmod3a tst-auditmod3b \
 		unload3mod1 unload3mod2 unload3mod3 unload3mod4 \
 		unload4mod1 unload4mod2 unload4mod3 unload4mod4 \
 		unload6mod1 unload6mod2 unload6mod3 \
@@ -959,6 +964,10 @@ tst-audit1-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
 $(objpfx)tst-audit2.out: $(objpfx)tst-auditmod1.so
 tst-audit2-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
 
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
+
 $(objpfx)tst-global1: $(libdl)
 $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so
 
diff --git a/elf/tst-audit3.c b/elf/tst-audit3.c
new file mode 100644
index 0000000000..ae86cc6b7d
--- /dev/null
+++ b/elf/tst-audit3.c
@@ -0,0 +1,20 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <emmintrin.h>
+
+extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i,
+			   __m128i, __m128i, __m128i, __m128i);
+int
+main (void)
+{
+  __m128i xmm = _mm_setzero_si128 ();
+  __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm);
+
+  if (memcmp (&xmm, &ret, sizeof (ret)))
+    abort ();
+
+  return 0;
+}

diff --git a/elf/tst-auditmod3a.c b/elf/tst-auditmod3a.c
new file mode 100644
index 0000000000..6019589e24
--- /dev/null
+++ b/elf/tst-auditmod3a.c
@@ -0,0 +1,24 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+__m128i
+audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	    __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm = _mm_setzero_si128 ();
+
+  if (memcmp (&xmm, &x0, sizeof (xmm))
+      || memcmp (&xmm, &x1, sizeof (xmm))
+      || memcmp (&xmm, &x2, sizeof (xmm))
+      || memcmp (&xmm, &x3, sizeof (xmm))
+      || memcmp (&xmm, &x4, sizeof (xmm))
+      || memcmp (&xmm, &x5, sizeof (xmm))
+      || memcmp (&xmm, &x6, sizeof (xmm))
+      || memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return xmm;
+}

diff --git a/elf/tst-auditmod3b.c b/elf/tst-auditmod3b.c
new file mode 100644
index 0000000000..388ed6e49c
--- /dev/null
+++ b/elf/tst-auditmod3b.c
@@ -0,0 +1,156 @@
+/* Verify that changing xmm registers in audit library won't affect
+   function parameter passing/return.
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, outregs->int_retval); + + __m128i xmm = _mm_set1_epi32 (-1); + asm 
volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return 0; +} diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index d8d9bc12a4..33e6115f7b 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -107,7 +107,8 @@ _dl_runtime_profile: movaps %xmm3, 112(%rsp) movaps %xmm4, 128(%rsp) movaps %xmm5, 144(%rsp) - movaps %xmm7, 160(%rsp) + movaps %xmm6, 160(%rsp) + movaps %xmm7, 176(%rsp) movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. movq 48(%rbx), %rdx # Load return address if needed. @@ -128,7 +129,8 @@ _dl_runtime_profile: movaps 112(%rsp), %xmm3 movaps 128(%rsp), %xmm4 movaps 144(%rsp), %xmm5 - movaps 160(%rsp), %xmm7 + movaps 160(%rsp), %xmm6 + movaps 176(%rsp), %xmm7 movq 16(%rbx), %r10 # Anything in framesize? testq %r10, %r10 From 241e68032077f92de17f69ac77161807c232b346 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 2 Jul 2009 04:34:35 -0700 Subject: [PATCH 05/50] Fix whitespace in last patch. --- elf/tst-audit3.c | 2 +- elf/tst-auditmod3a.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/elf/tst-audit3.c b/elf/tst-audit3.c index ae86cc6b7d..d00db9972b 100644 --- a/elf/tst-audit3.c +++ b/elf/tst-audit3.c @@ -12,7 +12,7 @@ main (void) { __m128i xmm = _mm_setzero_si128 (); __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); - + if (memcmp (&xmm, &ret, sizeof (ret))) abort (); diff --git a/elf/tst-auditmod3a.c b/elf/tst-auditmod3a.c index 6019589e24..9514aba505 100644 --- a/elf/tst-auditmod3a.c +++ b/elf/tst-auditmod3a.c @@ -9,7 +9,7 @@ audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, __m128i x4, __m128i x5, __m128i x6, __m128i x7) { __m128i xmm = _mm_setzero_si128 (); - + if (memcmp (&xmm, &x0, sizeof (xmm)) || memcmp (&xmm, &x1, sizeof (xmm)) || memcmp (&xmm, &x2, sizeof (xmm)) From 06e51c8f3de38761f8855700841bc49cf495c8c0 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 3 Jul 2009 02:48:56 -0700 Subject: [PATCH 06/50] Add SSE4.2 support for strcspn, strpbrk, and strspn on x86-64. --- ChangeLog | 16 ++ config.h.in | 3 + config.make.in | 2 + configure | 2 + configure.in | 1 + sysdeps/i386/configure | 71 +++++- sysdeps/i386/configure.in | 11 + sysdeps/x86_64/multiarch/Makefile | 6 + sysdeps/x86_64/multiarch/strcspn-c.c | 331 +++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/strcspn.S | 82 +++++++ sysdeps/x86_64/multiarch/strpbrk-c.c | 4 + sysdeps/x86_64/multiarch/strpbrk.S | 3 + sysdeps/x86_64/multiarch/strspn-c.c | 287 +++++++++++++++++++++++ sysdeps/x86_64/multiarch/strspn.S | 63 +++++ 14 files changed, 875 insertions(+), 7 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/strcspn-c.c create mode 100644 sysdeps/x86_64/multiarch/strcspn.S create mode 100644 sysdeps/x86_64/multiarch/strpbrk-c.c create mode 100644 sysdeps/x86_64/multiarch/strpbrk.S create mode 100644 sysdeps/x86_64/multiarch/strspn-c.c create mode 100644 sysdeps/x86_64/multiarch/strspn.S diff --git a/ChangeLog b/ChangeLog index 795d07ec48..53fcf2a1f8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2009-07-02 H.J. 
Lu + + * config.h.in (HAVE_SSE4_SUPPORT): New macro. + * config.make.in (config-cflags-sse4): New variable. + * configure.in: Substitute libc_cv_cc_sse4. + * sysdeps/i386/configure.in: Set libc_cv_cc_sse4 and + HAVE_SSE4_SUPPORT. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strcspn-c, strpbrk-c, strspn-c for string if gcc supports SSE4. + * sysdeps/x86_64/multiarch/strcspn-c.c: New file. + * sysdeps/x86_64/multiarch/strcspn.S: New file. + * sysdeps/x86_64/multiarch/strpbrk-c.c: New file. + * sysdeps/x86_64/multiarch/strpbrk.S: New file. + * sysdeps/x86_64/multiarch/strspn-c.c: New file. + * sysdeps/x86_64/multiarch/strspn.S: New file. + 2009-06-30 H.J. Lu * elf/Makefile (distribute): Remove tst-audit.sh. Add diff --git a/config.h.in b/config.h.in index 8dbc224a7d..4ddab7d775 100644 --- a/config.h.in +++ b/config.h.in @@ -129,6 +129,9 @@ /* Define if binutils support TLS handling. */ #undef HAVE_TLS_SUPPORT +/* Define if gcc supports SSE4. */ +#undef HAVE_SSE4_SUPPORT + /* Define if the compiler's exception support is based on libunwind. */ #undef HAVE_CC_WITH_LIBUNWIND diff --git a/config.make.in b/config.make.in index e48ea2658d..5fb5c8110c 100644 --- a/config.make.in +++ b/config.make.in @@ -34,6 +34,8 @@ config-sysdirs = @sysnames@ cflags-cpu = @libc_cv_cc_submachine@ asflags-cpu = @libc_cv_cc_submachine@ +config-cflags-sse4 = @libc_cv_cc_sse4@ + defines = @DEFINES@ sysincludes = @SYSINCLUDES@ c++-sysincludes = @CXX_SYSINCLUDES@ diff --git a/configure b/configure index 88cf4fd853..e30778fd94 100755 --- a/configure +++ b/configure @@ -657,6 +657,7 @@ xcoff elf ldd_rewrite_script use_ldconfig +libc_cv_cc_sse4 libc_cv_cpp_asm_debuginfo libc_cv_forced_unwind libc_cv_rootsbindir @@ -8744,6 +8745,7 @@ fi + if test $elf = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_ELF 1 diff --git a/configure.in b/configure.in index 6a92bd876a..216cdc9d07 100644 --- a/configure.in +++ b/configure.in @@ -2259,6 +2259,7 @@ AC_SUBST(libc_cv_forced_unwind) dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests AC_SUBST(libc_cv_cpp_asm_debuginfo) +AC_SUBST(libc_cv_cc_sse4) AC_SUBST(use_ldconfig) AC_SUBST(ldd_rewrite_script) diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index d1d4dc15a7..cbc8cd9206 100755 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -1,10 +1,42 @@ +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/i386. 
-echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 -echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6 +{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 +$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... " >&6; } if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 + $as_echo_n "(cached) " >&6 else cat > conftest.S <&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_pattern='conftest\.S' { ac_try='readelf --debug-dump=line conftest.o | @@ -35,7 +67,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } }; then libc_cv_cpp_asm_debuginfo=yes @@ -44,11 +76,36 @@ else fi rm -f conftest* fi -echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 -echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6 +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 +$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; } if test $libc_cv_cpp_asm_debuginfo = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_CPP_ASM_DEBUGINFO 1 _ACEOF fi + +{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5 +$as_echo_n "checking for SSE4 support... " >&6; } +if test "${libc_cv_cc_sse4+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5 +$as_echo "$libc_cv_cc_sse4" >&6; } +if test $libc_cv_cc_sse4 = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_SSE4_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in index 028e1ae8e1..44f53a57a0 100644 --- a/sysdeps/i386/configure.in +++ b/sysdeps/i386/configure.in @@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo) if test $libc_cv_cpp_asm_debuginfo = yes; then AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO) fi + +dnl Check if -msse4 works. 
+AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl +if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi]) +if test $libc_cv_cc_sse4 = yes; then + AC_DEFINE(HAVE_SSE4_SUPPORT) +fi diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 127592aa3a..71e85f0652 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,4 +5,10 @@ endif ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strncmp-c +ifeq (yes,$(config-cflags-sse4)) +sysdep_routines += strcspn-c strpbrk-c strspn-c +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif endif diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..6a8b87a866 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,331 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +#ifndef STRCSPN_SSE2 +#define STRCSPN_SSE2 __strcspn_sse2 +#define STRCSPN_SSE42 __strcspn_sse42 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_SSE2 (const char *, const char *); + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + int offset; + const char *aligned; + __m128i mask, mask0, mask1; + __m128i value; + int index, length; + int cflag, zflag; + + if (*a == 0) +#ifdef USE_AS_STRPBRK + return NULL; +#else + return strlen (s); +#endif + + offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. 
*/ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return STRCSPN_SSE2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. */ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. 
*/ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) +#ifdef USE_AS_STRPBRK + return (char *) (s + length); +#else + return length; +#endif + /* Find where the NULL terminator is. */ + index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) +#ifdef USE_AS_STRPBRK + return NULL; +#else + return index; +#endif + aligned += 16; + } + else + aligned = s; + +loop: + value = _mm_load_si128 ((__m128i *) aligned); + index = _mm_cmpistri (mask, value, 0x2); + cflag = _mm_cmpistrc (mask, value, 0x2); + zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) +#ifdef USE_AS_STRPBRK + return (char *) (aligned + index); +#else + return (size_t) (aligned + index - s); +#endif + if (zflag) +#ifdef USE_AS_STRPBRK + return NULL; +#else + { + /* Find where the NULL terminator is. */ + index = _mm_cmpistri (value, value, 0x3a); + return (size_t) (aligned + index - s); + } +#endif + aligned += 16; + goto loop; +} diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..cc75ab70e6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,82 @@ +/* Multiple versions of strcspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include + +#ifdef HAVE_SSE4_SUPPORT + +#include +#include + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCSPN_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..c58dcb5605 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,4 @@ +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..ed5bca6a94 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,3 @@ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..e48e4a7207 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,287 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include +#include + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + int offset; + const char *aligned; + __m128i mask, mask0, mask1; + __m128i value; + int index, length; + int cflag; + + if (*a == 0) + return 0; + + offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. 
*/ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + +loop: + value = _mm_load_si128 ((__m128i *) aligned); + index = _mm_cmpistri (mask, value, 0x12); + cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + goto loop; +} diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..4183a2cf60 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,63 @@ +/* Multiple versions of strspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + +#ifdef HAVE_SSE4_SUPPORT + +#include +#include + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strspn_sse2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_sse2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#include "../strspn.S" From d6485c981b2b5aa4eb7cedc1ed1508183cb686f8 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 3 Jul 2009 03:01:57 -0700 Subject: [PATCH 07/50] Align functions to 16-byte boundary. Some of the new multi-arch string functions for x86-64 were not aligned to 16 byte boundarie,s possibly creating unnecessary cache line misses and delays. --- ChangeLog | 8 ++++++++ sysdeps/x86_64/multiarch/rawmemchr.S | 1 + sysdeps/x86_64/multiarch/strcmp.S | 1 + sysdeps/x86_64/multiarch/strcpy.S | 1 + sysdeps/x86_64/multiarch/strlen.S | 1 + 5 files changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 53fcf2a1f8..8b02c0cba5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2009-07-03 Ulrich Drepper + + * sysdeps/x86_64/multiarch/strcmp.S: Make sure functions are all + aligned to 16 byte boundaries. + * sysdeps/x86_64/multiarch/strcpy.S: Likewise. + * sysdeps/x86_64/multiarch/strlen.S: Likewise. + * sysdeps/x86_64/multiarch/rawmemchr.S: Likewise. + 2009-07-02 H.J. Lu * config.h.in (HAVE_SSE4_SUPPORT): New macro. 
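The hunks that follow add an explicit .align 16 to the redefined ENTRY macros so each SSE2 fallback entry point starts on a 16-byte boundary. As a rough standalone illustration of the property being enforced, here GCC's function alignment attribute stands in for the assembler directive (illustrative only, not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Force the entry point onto a 16-byte boundary, the same effect
   .align 16 has on the assembly implementations.  */
__attribute__ ((aligned (16))) static size_t
my_strlen (const char *s)
{
  const char *p = s;
  while (*p != '\0')
    ++p;
  return (size_t) (p - s);
}

int
main (void)
{
  uintptr_t entry = (uintptr_t) &my_strlen;
  printf ("entry mod 16 = %u\n", (unsigned int) (entry % 16));  /* 0 */
  return 0;
}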
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S index 93ca631633..d4f265f430 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/sysdeps/x86_64/multiarch/rawmemchr.S @@ -77,6 +77,7 @@ __rawmemchr_sse42: # undef ENTRY # define ENTRY(name) \ .type __rawmemchr_sse2, @function; \ + .align 16; \ __rawmemchr_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 2f4bf17d95..37985036aa 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1659,6 +1659,7 @@ LABEL(unaligned_table): # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ + .align 16; \ STRCMP_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 9920b0ec74..25cd01307d 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -1896,6 +1896,7 @@ LABEL(unaligned_table): # undef ENTRY # define ENTRY(name) \ .type STRCPY_SSE2, @function; \ + .align 16; \ STRCPY_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 79e6a977ec..82b03ccc28 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -77,6 +77,7 @@ __strlen_sse42: # undef ENTRY # define ENTRY(name) \ .type __strlen_sse2, @function; \ + .align 16; \ __strlen_sse2: cfi_startproc; \ CALL_MCOUNT # undef END From cea43295928d46c3a951ac9d949197c83da7e217 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 3 Jul 2009 03:23:01 -0700 Subject: [PATCH 08/50] Minor cleanups in recently added files. --- ChangeLog | 3 + sysdeps/x86_64/multiarch/strcspn-c.c | 91 +++++++++++----------------- sysdeps/x86_64/multiarch/strspn-c.c | 45 +++++++------- 3 files changed, 60 insertions(+), 79 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8b02c0cba5..2e0549e10a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2009-07-03 Ulrich Drepper + * sysdeps/x86_64/multiarch/strcspn-c.c: Minor cleanups. + * sysdeps/x86_64/multiarch/strspn-c.c: Likewise. + * sysdeps/x86_64/multiarch/strcmp.S: Make sure functions are all aligned to 16 byte boundaries. * sysdeps/x86_64/multiarch/strcpy.S: Likewise. diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 6a8b87a866..4512267d3f 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -54,8 +54,14 @@ X for case 1. */ #ifndef STRCSPN_SSE2 -#define STRCSPN_SSE2 __strcspn_sse2 -#define STRCSPN_SSE42 __strcspn_sse42 +# define STRCSPN_SSE2 __strcspn_sse2 +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 #endif extern @@ -66,6 +72,7 @@ size_t #endif STRCSPN_SSE2 (const char *, const char *); + #ifdef USE_AS_STRPBRK char * #else @@ -74,26 +81,17 @@ size_t __attribute__ ((section (".text.sse4.2"))) STRCSPN_SSE42 (const char *s, const char *a) { - int offset; - const char *aligned; - __m128i mask, mask0, mask1; - __m128i value; - int index, length; - int cflag, zflag; - if (*a == 0) -#ifdef USE_AS_STRPBRK - return NULL; -#else - return strlen (s); -#endif + RETURN (NULL, strlen (s)); - offset = (int) ((size_t) a & 15); + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); if (offset != 0) { /* Load masks. 
*/ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); - mask0 = _mm_load_si128 ((__m128i *) aligned); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); switch (offset) { @@ -145,12 +143,12 @@ STRCSPN_SSE42 (const char *s, const char *a) } /* Find where the NULL terminator is. */ - length = _mm_cmpistri (mask, mask, 0x3a); + int length = _mm_cmpistri (mask, mask, 0x3a); if (length == 16 - offset) { /* There is no NULL terminator. */ - mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - index = _mm_cmpistri (mask1, mask1, 0x3a); + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); length += index; /* Don't use SSE4.2 if the length of A > 16. */ @@ -217,7 +215,7 @@ STRCSPN_SSE42 (const char *s, const char *a) mask = _mm_load_si128 ((__m128i *) a); /* Find where the NULL terminator is. */ - length = _mm_cmpistri (mask, mask, 0x3a); + int length = _mm_cmpistri (mask, mask, 0x3a); if (length == 16) { /* There is no NULL terminator. Don't use SSE4.2 if the length @@ -232,7 +230,7 @@ STRCSPN_SSE42 (const char *s, const char *a) { /* Check partial string. */ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); - value = _mm_load_si128 ((__m128i *) aligned); + __m128i value = _mm_load_si128 ((__m128i *) aligned); switch (offset) { @@ -283,49 +281,32 @@ STRCSPN_SSE42 (const char *s, const char *a) break; } - length = _mm_cmpistri (mask, value, 0x2); + int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. */ - cflag = _mm_cmpistrc (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); if (cflag) -#ifdef USE_AS_STRPBRK - return (char *) (s + length); -#else - return length; -#endif + RETURN ((char *) (s + length), length); /* Find where the NULL terminator is. */ - index = _mm_cmpistri (value, value, 0x3a); + int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) -#ifdef USE_AS_STRPBRK - return NULL; -#else - return index; -#endif + RETURN (NULL, index); aligned += 16; } else aligned = s; -loop: - value = _mm_load_si128 ((__m128i *) aligned); - index = _mm_cmpistri (mask, value, 0x2); - cflag = _mm_cmpistrc (mask, value, 0x2); - zflag = _mm_cmpistrz (mask, value, 0x2); - if (cflag) -#ifdef USE_AS_STRPBRK - return (char *) (aligned + index); -#else - return (size_t) (aligned + index - s); -#endif - if (zflag) -#ifdef USE_AS_STRPBRK - return NULL; -#else + while (1) { - /* Find where the NULL terminator is. */ - index = _mm_cmpistri (value, value, 0x3a); - return (size_t) (aligned + index - s); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. 
*/ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; } -#endif - aligned += 16; - goto loop; } diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index e48e4a7207..5b99f0d383 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -54,26 +54,22 @@ extern size_t __strspn_sse2 (const char *, const char *); + size_t __attribute__ ((section (".text.sse4.2"))) __strspn_sse42 (const char *s, const char *a) { - int offset; - const char *aligned; - __m128i mask, mask0, mask1; - __m128i value; - int index, length; - int cflag; - if (*a == 0) return 0; - offset = (int) ((size_t) a & 15); + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); - mask0 = _mm_load_si128 ((__m128i *) aligned); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); switch (offset) { @@ -125,12 +121,12 @@ __strspn_sse42 (const char *s, const char *a) } /* Find where the NULL terminator is. */ - length = _mm_cmpistri (mask, mask, 0x3a); + int length = _mm_cmpistri (mask, mask, 0x3a); if (length == 16 - offset) { /* There is no NULL terminator. */ - mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - index = _mm_cmpistri (mask1, mask1, 0x3a); + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); length += index; /* Don't use SSE4.2 if the length of A > 16. */ @@ -197,7 +193,7 @@ __strspn_sse42 (const char *s, const char *a) mask = _mm_load_si128 ((__m128i *) a); /* Find where the NULL terminator is. */ - length = _mm_cmpistri (mask, mask, 0x3a); + int length = _mm_cmpistri (mask, mask, 0x3a); if (length == 16) { /* There is no NULL terminator. Don't use SSE4.2 if the length @@ -212,7 +208,7 @@ __strspn_sse42 (const char *s, const char *a) { /* Check partial string. */ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); - value = _mm_load_si128 ((__m128i *) aligned); + __m128i value = _mm_load_si128 ((__m128i *) aligned); switch (offset) { @@ -263,12 +259,12 @@ __strspn_sse42 (const char *s, const char *a) break; } - length = _mm_cmpistri (mask, value, 0x12); + int length = _mm_cmpistri (mask, value, 0x12); /* No need to check CFlag since it is always 1. */ if (length < 16 - offset) return length; /* Find where the NULL terminator is. */ - index = _mm_cmpistri (value, value, 0x3a); + int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) return length; aligned += 16; @@ -276,12 +272,13 @@ __strspn_sse42 (const char *s, const char *a) else aligned = s; -loop: - value = _mm_load_si128 ((__m128i *) aligned); - index = _mm_cmpistri (mask, value, 0x12); - cflag = _mm_cmpistrc (mask, value, 0x12); - if (cflag) - return (size_t) (aligned + index - s); - aligned += 16; - goto loop; + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } } From 2123d5815eaaa53a772be4b6069bdc6332f2f491 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Sun, 5 Jul 2009 23:46:03 -0700 Subject: [PATCH 09/50] Fix wrong PPC_FEATURE_* values. Nothing uses these wrong values yet, but it fixes a warning due to conflicting definitions in . 
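A quick illustration of the corrected assignments, with the two values taken from the hunk below and a made-up AT_HWCAP word standing in for what the kernel reports (illustrative only):

#include <stdio.h>

#define PPC_FEATURE_ARCH_2_06 0x00000100  /* ISA 2.06 */
#define PPC_FEATURE_HAS_VSX   0x00000080  /* P7 Vector Extension.  */

int
main (void)
{
  /* Hypothetical hwcap word: a CPU advertising ISA 2.06 but not VSX.  */
  unsigned long int hwcap = PPC_FEATURE_ARCH_2_06;

  printf ("ISA 2.06: %s\n", (hwcap & PPC_FEATURE_ARCH_2_06) ? "yes" : "no");
  printf ("VSX:      %s\n", (hwcap & PPC_FEATURE_HAS_VSX) ? "yes" : "no");
  return 0;
}

With the previous, swapped values the same test would have reported VSX instead of ISA 2.06, disagreeing with the kernel's definitions of these bits.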
--- ChangeLog | 5 +++++ sysdeps/powerpc/sysdep.h | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2e0549e10a..9d9828c4ff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2009-07-03 Andreas Schwab + + * sysdeps/powerpc/sysdep.h (PPC_FEATURE_ARCH_2_06): Fix value. + (PPC_FEATURE_HAS_VSX): Likewise. + 2009-07-03 Ulrich Drepper * sysdeps/x86_64/multiarch/strcspn-c.c: Minor cleanups. diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h index 43edeb71eb..f5c79c54ef 100644 --- a/sysdeps/powerpc/sysdep.h +++ b/sysdeps/powerpc/sysdep.h @@ -44,8 +44,8 @@ #define PPC_FEATURE_PA6T 0x00000800 /* PA Semi 6T Core */ #define PPC_FEATURE_HAS_DFP 0x00000400 /* Decimal FP Unit */ #define PPC_FEATURE_POWER6_EXT 0x00000200 /* P6 + mffgpr/mftgpr */ -#define PPC_FEATURE_HAS_VSX 0x00000100 /* P7 Vector Extension. */ -#define PPC_FEATURE_ARCH_2_06 0x00000080 /* ISA 2.06 */ +#define PPC_FEATURE_ARCH_2_06 0x00000100 /* ISA 2.06 */ +#define PPC_FEATURE_HAS_VSX 0x00000080 /* P7 Vector Extension. */ #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC) #ifdef __ASSEMBLER__ From a38862a58910a5209c9ac3baae5797fbbedbeb1c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Mon, 6 Jul 2009 06:55:57 -0700 Subject: [PATCH 10/50] Optimize test for valid ELF symbol types in lookup function. --- ChangeLog | 4 ++++ elf/do-lookup.h | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9d9828c4ff..c364e5e6b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2009-07-06 Ulrich Drepper + + * elf/do-lookup.h (ALLOWED_STT): Optimize test for valid symbol types. + 2009-07-03 Andreas Schwab * sysdeps/powerpc/sysdep.h (PPC_FEATURE_ARCH_2_06): Fix value. diff --git a/elf/do-lookup.h b/elf/do-lookup.h index ae74da4846..acbc53dbbe 100644 --- a/elf/do-lookup.h +++ b/elf/do-lookup.h @@ -87,13 +87,13 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, 0)) return NULL; - if (__builtin_expect (stt > STT_FUNC - && stt != STT_COMMON - && stt != STT_TLS - && stt != STT_GNU_IFUNC, 0)) - /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_COMMON, - STT_TLS, and STT_GNU_IFUNC since these are no code/data - definitions. */ + /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, + STT_COMMON, STT_TLS, and STT_GNU_IFUNC since these are no + code/data definitions. */ +#define ALLOWED_STT \ + ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC) \ + | (1 << STT_COMMON) | (1 << STT_TLS) | (1 << STT_GNU_IFUNC)) + if (__builtin_expect (((1 << stt) & ALLOWED_STT) == 0, 0)) return NULL; if (sym != ref && strcmp (strtab + sym->st_name, undef_name)) From eba0994e75e622ad30c6dcdb53e5ddedd043f6d7 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 7 Jul 2009 09:49:55 -0700 Subject: [PATCH 11/50] Clean up code for hash table handling in ld.so. --- ChangeLog | 7 ++++ elf/dl-misc.c | 64 +++++++++++++++++++++++++++++++++++ include/inline-hashtab.h | 72 ++-------------------------------------- 3 files changed, 74 insertions(+), 69 deletions(-) diff --git a/ChangeLog b/ChangeLog index c364e5e6b2..d83ca214d1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2009-07-07 Ulrich Drepper + + * elf/dl-misc.c (_dl_higher_prime_number): New function. Moved here + from... + * include/inline-hashtab.h: ...here. + (htab_expand): Adjust for renamed function. Correct memory handling. + 2009-07-06 Ulrich Drepper * elf/do-lookup.h (ALLOWED_STT): Optimize test for valid symbol types. 
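Before the hash-table cleanup below, a standalone sketch of the ALLOWED_STT change from the do-lookup.h hunk above: five chained comparisons collapse into a single shift and AND against a constant mask. The STT_* values are inlined here from elf.h so the sketch compiles on its own (illustrative only):

#include <stdio.h>

#define STT_NOTYPE    0   /* Symbol type is unspecified.  */
#define STT_OBJECT    1   /* Symbol is a data object.  */
#define STT_FUNC      2   /* Symbol is a code object.  */
#define STT_COMMON    5   /* Symbol is a common data object.  */
#define STT_TLS       6   /* Symbol is thread-local data.  */
#define STT_GNU_IFUNC 10  /* Symbol is an indirect code object.  */

#define ALLOWED_STT \
  ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC) \
   | (1 << STT_COMMON) | (1 << STT_TLS) | (1 << STT_GNU_IFUNC))

int
main (void)
{
  /* Symbol types 0..15, as extracted by ELFW(ST_TYPE).  */
  for (unsigned int stt = 0; stt < 16; ++stt)
    printf ("stt %2u: %s\n", stt,
            ((1 << stt) & ALLOWED_STT) == 0 ? "ignored" : "allowed");
  return 0;
}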
diff --git a/elf/dl-misc.c b/elf/dl-misc.c index 7c77cd040f..7d4e1a1725 100644 --- a/elf/dl-misc.c +++ b/elf/dl-misc.c @@ -312,3 +312,67 @@ _dl_name_match_p (const char *name, const struct link_map *map) return 0; } + + +unsigned long int +_dl_higher_prime_number (unsigned long int n) +{ + /* These are primes that are near, but slightly smaller than, a + power of two. */ + static const uint32_t primes[] = { + UINT32_C (7), + UINT32_C (13), + UINT32_C (31), + UINT32_C (61), + UINT32_C (127), + UINT32_C (251), + UINT32_C (509), + UINT32_C (1021), + UINT32_C (2039), + UINT32_C (4093), + UINT32_C (8191), + UINT32_C (16381), + UINT32_C (32749), + UINT32_C (65521), + UINT32_C (131071), + UINT32_C (262139), + UINT32_C (524287), + UINT32_C (1048573), + UINT32_C (2097143), + UINT32_C (4194301), + UINT32_C (8388593), + UINT32_C (16777213), + UINT32_C (33554393), + UINT32_C (67108859), + UINT32_C (134217689), + UINT32_C (268435399), + UINT32_C (536870909), + UINT32_C (1073741789), + UINT32_C (2147483647), + /* 4294967291L */ + UINT32_C (2147483647) + UINT32_C (2147483644) + }; + + const uint32_t *low = &primes[0]; + const uint32_t *high = &primes[sizeof (primes) / sizeof (primes[0])]; + + while (low != high) + { + const uint32_t *mid = low + (high - low) / 2; + if (n > *mid) + low = mid + 1; + else + high = mid; + } + +#if 0 + /* If we've run out of primes, abort. */ + if (n > *low) + { + fprintf (stderr, "Cannot find prime bigger than %lu\n", n); + abort (); + } +#endif + + return *low; +} diff --git a/include/inline-hashtab.h b/include/inline-hashtab.h index c359161c54..0f6719b91c 100644 --- a/include/inline-hashtab.h +++ b/include/inline-hashtab.h @@ -1,7 +1,5 @@ /* Fully-inline hash table, used mainly for managing TLS descriptors. - - Copyright (C) 1999, 2000, 2001, 2002, 2003, 2005, 2008 - Free Software Foundation, Inc. + Copyright (C) 1999-2003, 2005, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Alexandre Oliva @@ -30,69 +28,6 @@ extern void weak_function free (void *ptr); -inline static unsigned long -higher_prime_number (unsigned long n) -{ - /* These are primes that are near, but slightly smaller than, a - power of two. */ - static const uint32_t primes[] = { - UINT32_C (7), - UINT32_C (13), - UINT32_C (31), - UINT32_C (61), - UINT32_C (127), - UINT32_C (251), - UINT32_C (509), - UINT32_C (1021), - UINT32_C (2039), - UINT32_C (4093), - UINT32_C (8191), - UINT32_C (16381), - UINT32_C (32749), - UINT32_C (65521), - UINT32_C (131071), - UINT32_C (262139), - UINT32_C (524287), - UINT32_C (1048573), - UINT32_C (2097143), - UINT32_C (4194301), - UINT32_C (8388593), - UINT32_C (16777213), - UINT32_C (33554393), - UINT32_C (67108859), - UINT32_C (134217689), - UINT32_C (268435399), - UINT32_C (536870909), - UINT32_C (1073741789), - UINT32_C (2147483647), - /* 4294967291L */ - UINT32_C (2147483647) + UINT32_C (2147483644) - }; - - const uint32_t *low = &primes[0]; - const uint32_t *high = &primes[sizeof (primes) / sizeof (primes[0])]; - - while (low != high) - { - const uint32_t *mid = low + (high - low) / 2; - if (n > *mid) - low = mid + 1; - else - high = mid; - } - -#if 0 - /* If we've run out of primes, abort. */ - if (n > *low) - { - fprintf (stderr, "Cannot find prime bigger than %lu\n", n); - abort (); - } -#endif - - return *low; -} - struct hashtab { /* Table itself. */ @@ -203,12 +138,11 @@ htab_expand (struct hashtab *htab, int (*hash_fn) (void *)) /* Resize only when table after removal of unused elements is either too full or too empty. 
*/ if (htab->n_elements * 2 > htab->size) - nsize = higher_prime_number (htab->n_elements * 2); + nsize = _dl_higher_prime_number (htab->n_elements * 2); else nsize = htab->size; - nentries = malloc (sizeof (void *) * nsize); - memset (nentries, 0, sizeof (void *) * nsize); + nentries = calloc (sizeof (void *), nsize); if (nentries == NULL) return 0; htab->entries = nentries; From b4f55afd031f14531ba7681032fc5f75a1578320 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 7 Jul 2009 09:53:01 -0700 Subject: [PATCH 12/50] Define STB_GNU_UNIQUE. --- ChangeLog | 2 ++ elf/elf.h | 1 + 2 files changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index d83ca214d1..f06f610d36 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2009-07-07 Ulrich Drepper + * elf/elf.h (STB_GNU_UNIQUE): Define. + * elf/dl-misc.c (_dl_higher_prime_number): New function. Moved here from... * include/inline-hashtab.h: ...here. diff --git a/elf/elf.h b/elf/elf.h index 8fdf74b099..7efdedefb4 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -444,6 +444,7 @@ typedef struct #define STB_WEAK 2 /* Weak symbol */ #define STB_NUM 3 /* Number of defined types. */ #define STB_LOOS 10 /* Start of OS-specific */ +#define STB_GNU_UNIQUE 10 /* Unique symbol. */ #define STB_HIOS 12 /* End of OS-specific */ #define STB_LOPROC 13 /* Start of processor-specific */ #define STB_HIPROC 15 /* End of processor-specific */ From 415ac3df9b10ae426d4f71f9d48003f6a3c7bd8d Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 9 Jul 2009 23:52:22 -0700 Subject: [PATCH 13/50] Implement STB_GNU_UNIQUE handling. Some symbols have to be identified process-wide by their name. This is particularly important for some C++ features (e.g., class local static data and static variables in inline functions). This cannot completely be implemented with ELF functionality so far. The STB_GNU_UNIQUE binding helps by ensuring the dynamic linker will always use the same definition for all symbols with the same name and this binding. --- ChangeLog | 24 +++++++- config.h.in | 3 + configure | 26 ++++++++ configure.in | 17 ++++++ elf/Makefile | 18 +++++- elf/dl-lookup.c | 6 +- elf/dl-open.c | 7 ++- elf/do-lookup.h | 118 ++++++++++++++++++++++++++++++++++++- elf/rtld.c | 7 ++- elf/tst-unique1.c | 40 +++++++++++++ elf/tst-unique1mod1.c | 21 +++++++ elf/tst-unique1mod2.c | 20 +++++++ elf/tst-unique2.c | 32 ++++++++++ elf/tst-unique2mod1.c | 13 ++++ elf/tst-unique2mod2.c | 20 +++++++ sysdeps/generic/ldsodefs.h | 15 +++++ 16 files changed, 375 insertions(+), 12 deletions(-) create mode 100644 elf/tst-unique1.c create mode 100644 elf/tst-unique1mod1.c create mode 100644 elf/tst-unique1mod2.c create mode 100644 elf/tst-unique2.c create mode 100644 elf/tst-unique2mod1.c create mode 100644 elf/tst-unique2mod2.c diff --git a/ChangeLog b/ChangeLog index f06f610d36..bdb320fd25 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +2009-07-09 Ulrich Drepper + + * configure.in: Check for gnu_unique_symbol symbol type. + * config.h.in: Add HAVE_ASM_UNIQUE_OBJECT entry. + * elf/do-lookup.h (do_lookup_x): Take new parameter with link map of + the undefined symbol. Handle STB_GNU_UNIQUE binding of found symbol. + * elf/dl-lookup.c (_dl_lookup_symbol_x): Adjust callers for do_lookup_x + change. + * sysdeps/generic/ldsodefs.h (struct rtld_global): Add definitions for + unique symbol table. + * elf/rtld.c (rtld_global): Initialize lock of unique symbol hash table + for first namespace. + * elf/dl-open.c (_dl_open): For new namespace, initialize lock for + unique symbol hash table. 
+ * elf/Makefile: Add rules to build and run tst-unique1 and tst-unique2. + * elf/tst-unique1.c: New file. + * elf/tst-unique1mod1.c: New file. + * elf/tst-unique1mod2.c: New file. + * elf/tst-unique2.c: New file. + * elf/tst-unique2mod1.c: New file. + * elf/tst-unique2mod2.c: New file. + 2009-07-07 Ulrich Drepper * elf/elf.h (STB_GNU_UNIQUE): Define. @@ -9,7 +31,7 @@ 2009-07-06 Ulrich Drepper - * elf/do-lookup.h (ALLOWED_STT): Optimize test for valid symbol types. + * elf/do-lookup.h (do_lookup_x): Optimize test for valid symbol types. 2009-07-03 Andreas Schwab diff --git a/config.h.in b/config.h.in index 4ddab7d775..5f16874584 100644 --- a/config.h.in +++ b/config.h.in @@ -59,6 +59,9 @@ assembler's `.type' directive, if it has one. */ #undef ASM_TYPE_DIRECTIVE_PREFIX +/* Define if the assembler supports the gnu_unique_object symbol type. */ +#undef HAVE_ASM_UNIQUE_OBJECT + /* Define a symbol_name as a global .symbol_name for ld. */ #undef HAVE_ASM_GLOBAL_DOT_NAME diff --git a/configure b/configure index e30778fd94..4e49f7068b 100755 --- a/configure +++ b/configure @@ -5994,6 +5994,32 @@ _ACEOF fi +{ $as_echo "$as_me:$LINENO: checking for assembler gnu_unique_object symbol type" >&5 +$as_echo_n "checking for assembler gnu_unique_object symbol type... " >&6; } +if test "${libc_cv_asm_unique_object+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <&5 2>&5; then + libc_cv_asm_unique_object=yes +else + libc_cv_asm_unique_object=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_asm_unique_object" >&5 +$as_echo "$libc_cv_asm_unique_object" >&6; } +if test $libc_cv_asm_unique_object = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_ASM_UNIQUE_OBJECT 1 +_ACEOF + +fi + # For the multi-arch option we need support in the assembler. if test "$multi_arch" = yes; then if test "x$libc_cv_asm_type_prefix" != xno; then diff --git a/configure.in b/configure.in index 216cdc9d07..61c87418e6 100644 --- a/configure.in +++ b/configure.in @@ -1211,6 +1211,23 @@ if test "x$libc_cv_asm_type_prefix" != xno; then AC_DEFINE_UNQUOTED(ASM_TYPE_DIRECTIVE_PREFIX, ${libc_cv_asm_type_prefix}) fi +AC_CACHE_CHECK(for assembler gnu_unique_object symbol type, + libc_cv_asm_unique_object, [dnl +cat > conftest.s <&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD; then + libc_cv_asm_unique_object=yes +else + libc_cv_asm_unique_object=no +fi +rm -f conftest*]) +if test $libc_cv_asm_unique_object = yes; then + AC_DEFINE(HAVE_ASM_UNIQUE_OBJECT) +fi + # For the multi-arch option we need support in the assembler. 
if test "$multi_arch" = yes; then if test "x$libc_cv_asm_type_prefix" != xno; then diff --git a/elf/Makefile b/elf/Makefile index 57febea483..cc5caeb521 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -111,7 +111,9 @@ distribute := rtld-Rules \ ifuncdep5.c ifuncdep5pic.c ifuncmod5.c \ ifuncmain6pie.c ifuncmod6.c \ ifuncmain7.c ifuncmain7pic.c ifuncmain7picstatic.c \ - ifuncmain7pie.c ifuncmain7static.c + ifuncmain7pie.c ifuncmain7static.c \ + tst-unique1.c tst-unique1mod1.c tst-unique1mod2.c \ + tst-unique2.c tst-unique2mod1.c tst-unique2mod2.c CFLAGS-dl-runtime.c = -fexceptions -fasynchronous-unwind-tables CFLAGS-dl-lookup.c = -fexceptions -fasynchronous-unwind-tables @@ -190,7 +192,8 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \ tst-dlmopen1 tst-dlmopen2 tst-dlmopen3 \ unload3 unload4 unload5 unload6 unload7 tst-global1 order2 \ tst-audit1 tst-audit2 \ - tst-stackguard1 tst-addr1 tst-thrlock + tst-stackguard1 tst-addr1 tst-thrlock \ + tst-unique1 tst-unique2 # reldep9 test-srcs = tst-pathopt tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog @@ -239,7 +242,9 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \ unload4mod1 unload4mod2 unload4mod3 unload4mod4 \ unload6mod1 unload6mod2 unload6mod3 \ unload7mod1 unload7mod2 \ - order2mod1 order2mod2 order2mod3 order2mod4 + order2mod1 order2mod2 order2mod3 order2mod4 \ + tst-unique1mod1 tst-unique1mod2 \ + tst-unique2mod1 tst-unique2mod2 ifeq (yes,$(have-initfini-array)) modules-names += tst-array2dep tst-array5dep endif @@ -1103,3 +1108,10 @@ $(objpfx)ifuncmain5pic: $(addprefix $(objpfx),ifuncmod5.so) $(objpfx)ifuncmain5static: $(addprefix $(objpfx),ifuncdep5.o) $(objpfx)ifuncmain5staticpic: $(addprefix $(objpfx),ifuncdep5pic.o) $(objpfx)ifuncmain5picstatic: $(addprefix $(objpfx),ifuncdep5pic.o) + +$(objpfx)tst-unique1: $(libdl) +$(objpfx)tst-unique1.out: $(objpfx)tst-unique1mod1.so \ + $(objpfx)tst-unique1mod2.so + +$(objpfx)tst-unique2: $(libdl) $(objpfx)tst-unique2mod1.so +$(objpfx)tst-unique2.out: $(objpfx)tst-unique2mod2.so diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 707d650719..2ba885a639 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -337,7 +337,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map, { int res = do_lookup_x (undef_name, new_hash, &old_hash, *ref, ¤t_value, *scope, start, version, flags, - skip_map, type_class); + skip_map, type_class, undef_map); if (res > 0) break; @@ -410,7 +410,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map, for (scope = symbol_scope; *scope != NULL; i = 0, ++scope) if (do_lookup_x (undef_name, new_hash, &old_hash, *ref, &protected_value, *scope, i, version, flags, - skip_map, ELF_RTYPE_CLASS_PLT) != 0) + skip_map, ELF_RTYPE_CLASS_PLT, NULL) != 0) break; if (protected_value.s != NULL && protected_value.m != undef_map) @@ -536,7 +536,7 @@ _dl_debug_bindings (const char *undef_name, struct link_map *undef_map, do_lookup_x (undef_name, new_hash, &old_hash, *ref, &val, undef_map->l_local_scope[0], 0, version, 0, NULL, - type_class); + type_class, undef_map); if (val.s != value->s || val.m != value->m) conflict = 1; diff --git a/elf/dl-open.c b/elf/dl-open.c index c3f0e42d5e..b8ebfe0e60 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -569,7 +569,7 @@ _dl_open (const char *file, int mode, const void *caller_dlopen, Lmid_t nsid, if (GL(dl_ns)[nsid]._ns_loaded == NULL) break; - if (nsid == DL_NNS) + if (__builtin_expect (nsid == DL_NNS, 0)) { /* No more 
namespace available. */ __rtld_lock_unlock_recursive (GL(dl_load_lock)); @@ -579,7 +579,10 @@ no more namespaces available for dlmopen()")); } if (nsid == GL(dl_nns)) - ++GL(dl_nns); + { + __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock); + ++GL(dl_nns); + } _dl_debug_initialize (0, nsid)->r_state = RT_CONSISTENT; } diff --git a/elf/do-lookup.h b/elf/do-lookup.h index acbc53dbbe..782f490964 100644 --- a/elf/do-lookup.h +++ b/elf/do-lookup.h @@ -27,7 +27,7 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, unsigned long int *old_hash, const ElfW(Sym) *ref, struct sym_val *result, struct r_scope_elem *scope, size_t i, const struct r_found_version *const version, int flags, - struct link_map *skip, int type_class) + struct link_map *skip, int type_class, struct link_map *undef_map) { size_t n = scope->r_nlist; /* Make sure we read the value before proceeding. Otherwise we @@ -233,7 +233,7 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, if (sym != NULL) { found_it: - switch (ELFW(ST_BIND) (sym->st_info)) + switch (__builtin_expect (ELFW(ST_BIND) (sym->st_info), STB_GLOBAL)) { case STB_WEAK: /* Weak definition. Use this value if we don't find another. */ @@ -248,10 +248,124 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, } /* FALLTHROUGH */ case STB_GLOBAL: + success: /* Global definition. Just what we need. */ result->s = sym; result->m = (struct link_map *) map; return 1; + + case STB_GNU_UNIQUE:; + /* We have to determine whether we already found a + symbol with this name before. If not then we have to + add it to the search table. If we already found a + definition we have to use it. */ + void enter (struct unique_sym *table, size_t size, + unsigned int hash, const char *name, + const ElfW(Sym) *sym, const struct link_map *map) + { + size_t idx = hash % size; + size_t hash2 = 1 + hash % (size - 2); + while (1) + { + if (table[idx].hashval == 0) + { + table[idx].hashval = hash; + table[idx].name = strtab + sym->st_name; + if ((type_class & ELF_RTYPE_CLASS_COPY) != 0) + { + table[idx].sym = ref; + table[idx].map = undef_map; + } + else + { + table[idx].sym = sym; + table[idx].map = map; + } + return; + } + + idx += hash2; + if (idx >= size) + idx -= size; + } + } + + struct unique_sym_table *tab + = &GL(dl_ns)[map->l_ns]._ns_unique_sym_table; + + __rtld_lock_lock_recursive (tab->lock); + + struct unique_sym *entries = tab->entries; + size_t size = tab->size; + if (entries != NULL) + { + size_t idx = new_hash % size; + size_t hash2 = 1 + new_hash % (size - 2); + while (1) + { + if (entries[idx].hashval == new_hash + && strcmp (entries[idx].name, undef_name) == 0) + { + result->s = entries[idx].sym; + result->m = (struct link_map *) entries[idx].map; + __rtld_lock_unlock_recursive (tab->lock); + return 1; + } + + if (entries[idx].hashval == 0 + && entries[idx].name == NULL) + break; + + idx += hash2; + if (idx >= size) + idx -= size; + } + + if (size * 3 <= tab->n_elements) + { + /* Expand the table. 
 */
+		      size_t newsize = _dl_higher_prime_number (size);
+		      struct unique_sym *newentries
+			= calloc (sizeof (struct unique_sym), newsize);
+		      if (newentries == NULL)
+			{
+			nomem:
+			  __rtld_lock_unlock_recursive (tab->lock);
+			  _dl_fatal_printf ("out of memory\n");
+			}
+
+		      for (idx = 0; idx < size; ++idx)
+			if (entries[idx].hashval != 0)
+			  enter (newentries, newsize, entries[idx].hashval,
+				 entries[idx].name, entries[idx].sym,
+				 entries[idx].map);
+
+		      tab->free (entries);
+		      tab->size = newsize;
+		      entries = tab->entries = newentries;
+		      tab->free = free;
+		    }
+		}
+	      else
+		{
+#define INITIAL_NUNIQUE_SYM_TABLE 31
+		  size = INITIAL_NUNIQUE_SYM_TABLE;
+		  entries = calloc (sizeof (struct unique_sym), size);
+		  if (entries == NULL)
+		    goto nomem;
+
+		  tab->entries = entries;
+		  tab->size = size;
+		  tab->free = free;
+		}
+
+	      enter (entries, size, new_hash, strtab + sym->st_name, sym, map);
+	      ++tab->n_elements;
+
+	      __rtld_lock_unlock_recursive (tab->lock);
+
+	      goto success;
+	    default:
+	      /* Local symbols are ignored.  */
+	      break;
diff --git a/elf/rtld.c b/elf/rtld.c
index f97de9ac08..55b84c3bf4 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -127,7 +127,12 @@ struct rtld_global _rtld_global =
 #ifdef _LIBC_REENTRANT
   ._dl_load_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
 #endif
-  ._dl_nns = 1
+  ._dl_nns = 1,
+  ._dl_ns =
+    {
+      [LM_ID_BASE] = { ._ns_unique_sym_table
+		       = { .lock = _RTLD_LOCK_RECURSIVE_INITIALIZER } }
+    }
 };
 /* If we would use strong_alias here the compiler would see a non-hidden
    definition.  This would undo the effect of the previous
diff --git a/elf/tst-unique1.c b/elf/tst-unique1.c
new file mode 100644
index 0000000000..9b7996cd96
--- /dev/null
+++ b/elf/tst-unique1.c
@@ -0,0 +1,40 @@
+#include <config.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+static int
+do_test (void)
+{
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+  void *h1 = dlopen ("tst-unique1mod1.so", RTLD_LAZY);
+  if (h1 == NULL)
+    {
+      puts ("cannot load tst-unique1mod1");
+      return 1;
+    }
+  int *(*f1) (void) = dlsym (h1, "f");
+  if (f1 == NULL)
+    {
+      puts ("cannot locate f in tst-unique1mod1");
+      return 1;
+    }
+  void *h2 = dlopen ("tst-unique1mod2.so", RTLD_LAZY);
+  if (h2 == NULL)
+    {
+      puts ("cannot load tst-unique1mod2");
+      return 1;
+    }
+  int (*f2) (int *) = dlsym (h2, "f");
+  if (f2 == NULL)
+    {
+      puts ("cannot locate f in tst-unique1mod2");
+      return 1;
+    }
+  return f2 (f1 ());
+#else
+  return 0;
+#endif
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/elf/tst-unique1mod1.c b/elf/tst-unique1mod1.c
new file mode 100644
index 0000000000..16de28d25e
--- /dev/null
+++ b/elf/tst-unique1mod1.c
@@ -0,0 +1,21 @@
+#include <config.h>
+
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+# define S(s) _S (s)
+# define _S(s) #s
+
+asm (".data;"
+     S (ASM_GLOBAL_DIRECTIVE) " var\n"
+     ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n"
+     ".size var, 4\n"
+     "var:.zero 4\n"
+     ".previous");
+extern int var;
+
+int *
+f (void)
+{
+  var = 1;
+  return &var;
+}
+#endif
diff --git a/elf/tst-unique1mod2.c b/elf/tst-unique1mod2.c
new file mode 100644
index 0000000000..c075515827
--- /dev/null
+++ b/elf/tst-unique1mod2.c
@@ -0,0 +1,20 @@
+#include <config.h>
+
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+# define S(s) _S (s)
+# define _S(s) #s
+
+asm (".data;"
+     S (ASM_GLOBAL_DIRECTIVE) " var\n"
+     ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n"
+     ".size var, 4\n"
+     "var:.zero 4\n"
+     ".previous");
+extern int var;
+
+int
+f (int *p)
+{
+  return &var != p || *p != 1;
+}
+#endif
diff --git a/elf/tst-unique2.c b/elf/tst-unique2.c
new file mode 100644
index 0000000000..7bb0687364
--- /dev/null
+++ b/elf/tst-unique2.c
@@ -0,0 +1,32 @@
+#include <config.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+extern int var;
+
+static int
+do_test (void)
+{
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+  var = 1;
+
+  void *h = dlopen ("tst-unique2mod2.so", RTLD_LAZY);
+  if (h == NULL)
+    {
+      puts ("cannot load tst-unique2mod2");
+      return 1;
+    }
+  int (*f) (int *) = dlsym (h, "f");
+  if (f == NULL)
+    {
+      puts ("cannot locate f in tst-unique2mod2");
+      return 1;
+    }
+  return f (&var);
+#else
+  return 0;
+#endif
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/elf/tst-unique2mod1.c b/elf/tst-unique2mod1.c
new file mode 100644
index 0000000000..5e4ac4d68c
--- /dev/null
+++ b/elf/tst-unique2mod1.c
@@ -0,0 +1,13 @@
+#include <config.h>
+
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+# define S(s) _S (s)
+# define _S(s) #s
+
+asm (".data;"
+     S (ASM_GLOBAL_DIRECTIVE) " var\n"
+     ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n"
+     ".size var, 4\n"
+     "var:.zero 4\n"
+     ".previous");
+#endif
diff --git a/elf/tst-unique2mod2.c b/elf/tst-unique2mod2.c
new file mode 100644
index 0000000000..c075515827
--- /dev/null
+++ b/elf/tst-unique2mod2.c
@@ -0,0 +1,20 @@
+#include <config.h>
+
+#ifdef HAVE_ASM_UNIQUE_OBJECT
+# define S(s) _S (s)
+# define _S(s) #s
+
+asm (".data;"
+     S (ASM_GLOBAL_DIRECTIVE) " var\n"
+     ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n"
+     ".size var, 4\n"
+     "var:.zero 4\n"
+     ".previous");
+extern int var;
+
+int
+f (int *p)
+{
+  return &var != p || *p != 1;
+}
+#endif
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index b1af7fde0a..1e1bb4ccd4 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -383,6 +383,21 @@ struct rtld_global
        allocated by rtld.  Later it keeps the size of the map.  It might be
        reset if in _dl_close if the last global object is removed.  */
     size_t _ns_global_scope_alloc;
+    /* Search table for unique objects.  */
+    struct unique_sym_table
+    {
+      __rtld_lock_recursive_t lock;
+      struct unique_sym
+      {
+	uint32_t hashval;
+	const char *name;
+	const ElfW(Sym) *sym;
+	const struct link_map *map;
+      } *entries;
+      size_t size;
+      size_t n_elements;
+      void (*free) (void *);
+    } _ns_unique_sym_table;
     /* Keep track of changes to each namespace' list.  */
     struct r_debug _ns_debug;
   } _dl_ns[DL_NNS];

From 339717d5f117d118750ec187b1779cafa349249e Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Fri, 10 Jul 2009 06:14:25 -0700
Subject: [PATCH 14/50] Fix comment in Linux's <sys/epoll.h>.

---
 ChangeLog                           | 4 ++++
 sysdeps/unix/sysv/linux/sys/epoll.h | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index bdb320fd25..3a027d6d04 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2009-07-10  Ulrich Drepper
+
+	* sysdeps/unix/sysv/linux/sys/epoll.h: Fix comment.
+
 2009-07-09  Ulrich Drepper
 
 	* configure.in: Check for gnu_unique_symbol symbol type.
diff --git a/sysdeps/unix/sysv/linux/sys/epoll.h b/sysdeps/unix/sysv/linux/sys/epoll.h
index 12de0bcfe2..ca1d3d0459 100644
--- a/sysdeps/unix/sysv/linux/sys/epoll.h
+++ b/sysdeps/unix/sysv/linux/sys/epoll.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2006, 2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2006, 2007, 2008, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -31,7 +31,7 @@ typedef __sigset_t sigset_t;
 #endif
 
 
-/* Flags to be passed to epoll_create2.  */
+/* Flags to be passed to epoll_create1.
*/ enum { EPOLL_CLOEXEC = 02000000, From 786b74f41a076ac67b5d4fe59ab26e55745095df Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 10 Jul 2009 08:50:33 -0700 Subject: [PATCH 15/50] Remove do-lookup.h. It is not necessary/useful anymore to have the content separate from dl-lookup.c. --- ChangeLog | 3 + elf/Makefile | 2 +- elf/dl-lookup.c | 367 ++++++++++++++++++++++++++++++++++++++++++++- elf/do-lookup.h | 385 ------------------------------------------------ 4 files changed, 369 insertions(+), 388 deletions(-) delete mode 100644 elf/do-lookup.h diff --git a/ChangeLog b/ChangeLog index 3a027d6d04..04760b065c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2009-07-10 Ulrich Drepper + * elf/do-lookup.h: Removed after folding content into... + * elf/dl-lookup.c: ...here. + * sysdeps/unix/sysv/linux/sys/epoll.h: Fix comment. 2009-07-09 Ulrich Drepper diff --git a/elf/Makefile b/elf/Makefile index cc5caeb521..3e656ae0ad 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -47,7 +47,7 @@ distribute := rtld-Rules \ dl-cache.h dl-hash.h soinit.c sofini.c ldd.bash.in \ genrtldtbl.awk atomicity.h dl-procinfo.h ldsodefs.h \ dl-librecon.h interp.c sln.c dl-dst.h hp-timing.h \ - do-lookup.h dl-lookupcfg.h sprof.c gen-trusted-dirs.awk \ + dl-lookupcfg.h sprof.c gen-trusted-dirs.awk \ testobj1.c testobj2.c testobj3.c testobj4.c testobj5.c \ testobj6.c testobj1_1.c failobj.c unloadmod.c \ ldconfig.h ldconfig.c cache.c readlib.c readelflib.c \ diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 2ba885a639..fd0d624430 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -69,8 +69,371 @@ struct sym_val #endif -/* The actual lookup code. */ -#include "do-lookup.h" +/* Inner part of the lookup functions. We return a value > 0 if we + found the symbol, the value 0 if nothing is found and < 0 if + something bad happened. */ +static int +__attribute_noinline__ +do_lookup_x (const char *undef_name, uint_fast32_t new_hash, + unsigned long int *old_hash, const ElfW(Sym) *ref, + struct sym_val *result, struct r_scope_elem *scope, size_t i, + const struct r_found_version *const version, int flags, + struct link_map *skip, int type_class, struct link_map *undef_map) +{ + size_t n = scope->r_nlist; + /* Make sure we read the value before proceeding. Otherwise we + might use r_list pointing to the initial scope and r_nlist being + the value after a resize. That is the only path in dl-open.c not + protected by GSCOPE. A read barrier here might be to expensive. */ + __asm volatile ("" : "+r" (n), "+m" (scope->r_list)); + struct link_map **list = scope->r_list; + + do + { + /* These variables are used in the nested function. */ + Elf_Symndx symidx; + int num_versions = 0; + const ElfW(Sym) *versioned_sym = NULL; + + const struct link_map *map = list[i]->l_real; + + /* Here come the extra test needed for `_dl_lookup_symbol_skip'. */ + if (map == skip) + continue; + + /* Don't search the executable when resolving a copy reloc. */ + if ((type_class & ELF_RTYPE_CLASS_COPY) && map->l_type == lt_executable) + continue; + + /* Do not look into objects which are going to be removed. */ + if (map->l_removed) + continue; + + /* Print some debugging info if wanted. */ + if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_SYMBOLS, 0)) + _dl_debug_printf ("symbol=%s; lookup in file=%s [%lu]\n", + undef_name, + map->l_name[0] ? map->l_name : rtld_progname, + map->l_ns); + + /* If the hash table is empty there is nothing to do here. */ + if (map->l_nbuckets == 0) + continue; + + /* The tables for this map. 
*/ + const ElfW(Sym) *symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); + + + /* Nested routine to check whether the symbol matches. */ + const ElfW(Sym) * + __attribute_noinline__ + check_match (const ElfW(Sym) *sym) + { + unsigned int stt = ELFW(ST_TYPE) (sym->st_info); + assert (ELF_RTYPE_CLASS_PLT == 1); + if (__builtin_expect ((sym->st_value == 0 /* No value. */ + && stt != STT_TLS) + || (type_class & (sym->st_shndx == SHN_UNDEF)), + 0)) + return NULL; + + /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, + STT_COMMON, STT_TLS, and STT_GNU_IFUNC since these are no + code/data definitions. */ +#define ALLOWED_STT \ + ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC) \ + | (1 << STT_COMMON) | (1 << STT_TLS) | (1 << STT_GNU_IFUNC)) + if (__builtin_expect (((1 << stt) & ALLOWED_STT) == 0, 0)) + return NULL; + + if (sym != ref && strcmp (strtab + sym->st_name, undef_name)) + /* Not the symbol we are looking for. */ + return NULL; + + const ElfW(Half) *verstab = map->l_versyms; + if (version != NULL) + { + if (__builtin_expect (verstab == NULL, 0)) + { + /* We need a versioned symbol but haven't found any. If + this is the object which is referenced in the verneed + entry it is a bug in the library since a symbol must + not simply disappear. + + It would also be a bug in the object since it means that + the list of required versions is incomplete and so the + tests in dl-version.c haven't found a problem.*/ + assert (version->filename == NULL + || ! _dl_name_match_p (version->filename, map)); + + /* Otherwise we accept the symbol. */ + } + else + { + /* We can match the version information or use the + default one if it is not hidden. */ + ElfW(Half) ndx = verstab[symidx] & 0x7fff; + if ((map->l_versions[ndx].hash != version->hash + || strcmp (map->l_versions[ndx].name, version->name)) + && (version->hidden || map->l_versions[ndx].hash + || (verstab[symidx] & 0x8000))) + /* It's not the version we want. */ + return NULL; + } + } + else + { + /* No specific version is selected. There are two ways we + can got here: + + - a binary which does not include versioning information + is loaded + + - dlsym() instead of dlvsym() is used to get a symbol which + might exist in more than one form + + If the library does not provide symbol version information + there is no problem at at: we simply use the symbol if it + is defined. + + These two lookups need to be handled differently if the + library defines versions. In the case of the old + unversioned application the oldest (default) version + should be used. In case of a dlsym() call the latest and + public interface should be returned. */ + if (verstab != NULL) + { + if ((verstab[symidx] & 0x7fff) + >= ((flags & DL_LOOKUP_RETURN_NEWEST) ? 2 : 3)) + { + /* Don't accept hidden symbols. */ + if ((verstab[symidx] & 0x8000) == 0 + && num_versions++ == 0) + /* No version so far. */ + versioned_sym = sym; + + return NULL; + } + } + } + + /* There cannot be another entry for this symbol so stop here. 
*/ + return sym; + } + + const ElfW(Sym) *sym; + const ElfW(Addr) *bitmask = map->l_gnu_bitmask; + if (__builtin_expect (bitmask != NULL, 1)) + { + ElfW(Addr) bitmask_word + = bitmask[(new_hash / __ELF_NATIVE_CLASS) + & map->l_gnu_bitmask_idxbits]; + + unsigned int hashbit1 = new_hash & (__ELF_NATIVE_CLASS - 1); + unsigned int hashbit2 = ((new_hash >> map->l_gnu_shift) + & (__ELF_NATIVE_CLASS - 1)); + + if (__builtin_expect ((bitmask_word >> hashbit1) + & (bitmask_word >> hashbit2) & 1, 0)) + { + Elf32_Word bucket = map->l_gnu_buckets[new_hash + % map->l_nbuckets]; + if (bucket != 0) + { + const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket]; + + do + if (((*hasharr ^ new_hash) >> 1) == 0) + { + symidx = hasharr - map->l_gnu_chain_zero; + sym = check_match (&symtab[symidx]); + if (sym != NULL) + goto found_it; + } + while ((*hasharr++ & 1u) == 0); + } + } + /* No symbol found. */ + symidx = SHN_UNDEF; + } + else + { + if (*old_hash == 0xffffffff) + *old_hash = _dl_elf_hash (undef_name); + + /* Use the old SysV-style hash table. Search the appropriate + hash bucket in this object's symbol table for a definition + for the same symbol name. */ + for (symidx = map->l_buckets[*old_hash % map->l_nbuckets]; + symidx != STN_UNDEF; + symidx = map->l_chain[symidx]) + { + sym = check_match (&symtab[symidx]); + if (sym != NULL) + goto found_it; + } + } + + /* If we have seen exactly one versioned symbol while we are + looking for an unversioned symbol and the version is not the + default version we still accept this symbol since there are + no possible ambiguities. */ + sym = num_versions == 1 ? versioned_sym : NULL; + + if (sym != NULL) + { + found_it: + switch (__builtin_expect (ELFW(ST_BIND) (sym->st_info), STB_GLOBAL)) + { + case STB_WEAK: + /* Weak definition. Use this value if we don't find another. */ + if (__builtin_expect (GLRO(dl_dynamic_weak), 0)) + { + if (! result->s) + { + result->s = sym; + result->m = (struct link_map *) map; + } + break; + } + /* FALLTHROUGH */ + case STB_GLOBAL: + success: + /* Global definition. Just what we need. */ + result->s = sym; + result->m = (struct link_map *) map; + return 1; + + case STB_GNU_UNIQUE:; + /* We have to determine whether we already found a + symbol with this name before. If not then we have to + add it to the search table. If we already found a + definition we have to use it. 
*/ + void enter (struct unique_sym *table, size_t size, + unsigned int hash, const char *name, + const ElfW(Sym) *sym, const struct link_map *map) + { + size_t idx = hash % size; + size_t hash2 = 1 + hash % (size - 2); + while (1) + { + if (table[idx].hashval == 0) + { + table[idx].hashval = hash; + table[idx].name = strtab + sym->st_name; + if ((type_class & ELF_RTYPE_CLASS_COPY) != 0) + { + table[idx].sym = ref; + table[idx].map = undef_map; + } + else + { + table[idx].sym = sym; + table[idx].map = map; + } + return; + } + + idx += hash2; + if (idx >= size) + idx -= size; + } + } + + struct unique_sym_table *tab + = &GL(dl_ns)[map->l_ns]._ns_unique_sym_table; + + __rtld_lock_lock_recursive (tab->lock); + + struct unique_sym *entries = tab->entries; + size_t size = tab->size; + if (entries != NULL) + { + size_t idx = new_hash % size; + size_t hash2 = 1 + new_hash % (size - 2); + while (1) + { + if (entries[idx].hashval == new_hash + && strcmp (entries[idx].name, undef_name) == 0) + { + result->s = entries[idx].sym; + result->m = (struct link_map *) entries[idx].map; + __rtld_lock_unlock_recursive (tab->lock); + return 1; + } + + if (entries[idx].hashval == 0 + && entries[idx].name == NULL) + break; + + idx += hash2; + if (idx >= size) + idx -= size; + } + + if (size * 3 <= tab->n_elements) + { + /* Expand the table. */ + size_t newsize = _dl_higher_prime_number (size); + struct unique_sym *newentries + = calloc (sizeof (struct unique_sym), newsize); + if (newentries == NULL) + { + nomem: + __rtld_lock_unlock_recursive (tab->lock); + _dl_fatal_printf ("out of memory\n"); + } + + for (idx = 0; idx < size; ++idx) + if (entries[idx].hashval != 0) + enter (newentries, newsize, entries[idx].hashval, + entries[idx].name, entries[idx].sym, + entries[idx].map); + + tab->free (entries); + tab->size = newsize; + entries = tab->entries = newentries; + tab->free = free; + } + } + else + { +#define INITIAL_NUNIQUE_SYM_TABLE 31 + size = INITIAL_NUNIQUE_SYM_TABLE; + entries = calloc (sizeof (struct unique_sym), size); + if (entries == NULL) + goto nomem; + + tab->entries = entries; + tab->size = size; + tab->free = free; + } + + enter (entries, size, new_hash, strtab + sym->st_name, sym, map); + ++tab->n_elements; + + __rtld_lock_unlock_recursive (tab->lock); + + goto success; + + default: + /* Local symbols are ignored. */ + break; + } + } + + /* If this current map is the one mentioned in the verneed entry + and we have not found a weak entry, it is a bug. */ + if (symidx == STN_UNDEF && version != NULL && version->filename != NULL + && __builtin_expect (_dl_name_match_p (version->filename, map), 0)) + return -1; + } + while (++i < n); + + /* We have not found anything until now. */ + return 0; +} static uint_fast32_t diff --git a/elf/do-lookup.h b/elf/do-lookup.h deleted file mode 100644 index 782f490964..0000000000 --- a/elf/do-lookup.h +++ /dev/null @@ -1,385 +0,0 @@ -/* Look up a symbol in the loaded objects. - Copyright (C) 1995-2007, 2008, 2009 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - - -/* Inner part of the lookup functions. We return a value > 0 if we - found the symbol, the value 0 if nothing is found and < 0 if - something bad happened. */ -static int -__attribute_noinline__ -do_lookup_x (const char *undef_name, uint_fast32_t new_hash, - unsigned long int *old_hash, const ElfW(Sym) *ref, - struct sym_val *result, struct r_scope_elem *scope, size_t i, - const struct r_found_version *const version, int flags, - struct link_map *skip, int type_class, struct link_map *undef_map) -{ - size_t n = scope->r_nlist; - /* Make sure we read the value before proceeding. Otherwise we - might use r_list pointing to the initial scope and r_nlist being - the value after a resize. That is the only path in dl-open.c not - protected by GSCOPE. A read barrier here might be to expensive. */ - __asm volatile ("" : "+r" (n), "+m" (scope->r_list)); - struct link_map **list = scope->r_list; - - do - { - /* These variables are used in the nested function. */ - Elf_Symndx symidx; - int num_versions = 0; - const ElfW(Sym) *versioned_sym = NULL; - - const struct link_map *map = list[i]->l_real; - - /* Here come the extra test needed for `_dl_lookup_symbol_skip'. */ - if (map == skip) - continue; - - /* Don't search the executable when resolving a copy reloc. */ - if ((type_class & ELF_RTYPE_CLASS_COPY) && map->l_type == lt_executable) - continue; - - /* Do not look into objects which are going to be removed. */ - if (map->l_removed) - continue; - - /* Print some debugging info if wanted. */ - if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_SYMBOLS, 0)) - _dl_debug_printf ("symbol=%s; lookup in file=%s [%lu]\n", - undef_name, - map->l_name[0] ? map->l_name : rtld_progname, - map->l_ns); - - /* If the hash table is empty there is nothing to do here. */ - if (map->l_nbuckets == 0) - continue; - - /* The tables for this map. */ - const ElfW(Sym) *symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]); - const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); - - - /* Nested routine to check whether the symbol matches. */ - const ElfW(Sym) * - __attribute_noinline__ - check_match (const ElfW(Sym) *sym) - { - unsigned int stt = ELFW(ST_TYPE) (sym->st_info); - assert (ELF_RTYPE_CLASS_PLT == 1); - if (__builtin_expect ((sym->st_value == 0 /* No value. */ - && stt != STT_TLS) - || (type_class & (sym->st_shndx == SHN_UNDEF)), - 0)) - return NULL; - - /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, - STT_COMMON, STT_TLS, and STT_GNU_IFUNC since these are no - code/data definitions. */ -#define ALLOWED_STT \ - ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC) \ - | (1 << STT_COMMON) | (1 << STT_TLS) | (1 << STT_GNU_IFUNC)) - if (__builtin_expect (((1 << stt) & ALLOWED_STT) == 0, 0)) - return NULL; - - if (sym != ref && strcmp (strtab + sym->st_name, undef_name)) - /* Not the symbol we are looking for. */ - return NULL; - - const ElfW(Half) *verstab = map->l_versyms; - if (version != NULL) - { - if (__builtin_expect (verstab == NULL, 0)) - { - /* We need a versioned symbol but haven't found any. If - this is the object which is referenced in the verneed - entry it is a bug in the library since a symbol must - not simply disappear. 
- - It would also be a bug in the object since it means that - the list of required versions is incomplete and so the - tests in dl-version.c haven't found a problem.*/ - assert (version->filename == NULL - || ! _dl_name_match_p (version->filename, map)); - - /* Otherwise we accept the symbol. */ - } - else - { - /* We can match the version information or use the - default one if it is not hidden. */ - ElfW(Half) ndx = verstab[symidx] & 0x7fff; - if ((map->l_versions[ndx].hash != version->hash - || strcmp (map->l_versions[ndx].name, version->name)) - && (version->hidden || map->l_versions[ndx].hash - || (verstab[symidx] & 0x8000))) - /* It's not the version we want. */ - return NULL; - } - } - else - { - /* No specific version is selected. There are two ways we - can got here: - - - a binary which does not include versioning information - is loaded - - - dlsym() instead of dlvsym() is used to get a symbol which - might exist in more than one form - - If the library does not provide symbol version information - there is no problem at at: we simply use the symbol if it - is defined. - - These two lookups need to be handled differently if the - library defines versions. In the case of the old - unversioned application the oldest (default) version - should be used. In case of a dlsym() call the latest and - public interface should be returned. */ - if (verstab != NULL) - { - if ((verstab[symidx] & 0x7fff) - >= ((flags & DL_LOOKUP_RETURN_NEWEST) ? 2 : 3)) - { - /* Don't accept hidden symbols. */ - if ((verstab[symidx] & 0x8000) == 0 - && num_versions++ == 0) - /* No version so far. */ - versioned_sym = sym; - - return NULL; - } - } - } - - /* There cannot be another entry for this symbol so stop here. */ - return sym; - } - - const ElfW(Sym) *sym; - const ElfW(Addr) *bitmask = map->l_gnu_bitmask; - if (__builtin_expect (bitmask != NULL, 1)) - { - ElfW(Addr) bitmask_word - = bitmask[(new_hash / __ELF_NATIVE_CLASS) - & map->l_gnu_bitmask_idxbits]; - - unsigned int hashbit1 = new_hash & (__ELF_NATIVE_CLASS - 1); - unsigned int hashbit2 = ((new_hash >> map->l_gnu_shift) - & (__ELF_NATIVE_CLASS - 1)); - - if (__builtin_expect ((bitmask_word >> hashbit1) - & (bitmask_word >> hashbit2) & 1, 0)) - { - Elf32_Word bucket = map->l_gnu_buckets[new_hash - % map->l_nbuckets]; - if (bucket != 0) - { - const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket]; - - do - if (((*hasharr ^ new_hash) >> 1) == 0) - { - symidx = hasharr - map->l_gnu_chain_zero; - sym = check_match (&symtab[symidx]); - if (sym != NULL) - goto found_it; - } - while ((*hasharr++ & 1u) == 0); - } - } - /* No symbol found. */ - symidx = SHN_UNDEF; - } - else - { - if (*old_hash == 0xffffffff) - *old_hash = _dl_elf_hash (undef_name); - - /* Use the old SysV-style hash table. Search the appropriate - hash bucket in this object's symbol table for a definition - for the same symbol name. */ - for (symidx = map->l_buckets[*old_hash % map->l_nbuckets]; - symidx != STN_UNDEF; - symidx = map->l_chain[symidx]) - { - sym = check_match (&symtab[symidx]); - if (sym != NULL) - goto found_it; - } - } - - /* If we have seen exactly one versioned symbol while we are - looking for an unversioned symbol and the version is not the - default version we still accept this symbol since there are - no possible ambiguities. */ - sym = num_versions == 1 ? versioned_sym : NULL; - - if (sym != NULL) - { - found_it: - switch (__builtin_expect (ELFW(ST_BIND) (sym->st_info), STB_GLOBAL)) - { - case STB_WEAK: - /* Weak definition. 
Use this value if we don't find another. */ - if (__builtin_expect (GLRO(dl_dynamic_weak), 0)) - { - if (! result->s) - { - result->s = sym; - result->m = (struct link_map *) map; - } - break; - } - /* FALLTHROUGH */ - case STB_GLOBAL: - success: - /* Global definition. Just what we need. */ - result->s = sym; - result->m = (struct link_map *) map; - return 1; - - case STB_GNU_UNIQUE:; - /* We have to determine whether we already found a - symbol with this name before. If not then we have to - add it to the search table. If we already found a - definition we have to use it. */ - void enter (struct unique_sym *table, size_t size, - unsigned int hash, const char *name, - const ElfW(Sym) *sym, const struct link_map *map) - { - size_t idx = hash % size; - size_t hash2 = 1 + hash % (size - 2); - while (1) - { - if (table[idx].hashval == 0) - { - table[idx].hashval = hash; - table[idx].name = strtab + sym->st_name; - if ((type_class & ELF_RTYPE_CLASS_COPY) != 0) - { - table[idx].sym = ref; - table[idx].map = undef_map; - } - else - { - table[idx].sym = sym; - table[idx].map = map; - } - return; - } - - idx += hash2; - if (idx >= size) - idx -= size; - } - } - - struct unique_sym_table *tab - = &GL(dl_ns)[map->l_ns]._ns_unique_sym_table; - - __rtld_lock_lock_recursive (tab->lock); - - struct unique_sym *entries = tab->entries; - size_t size = tab->size; - if (entries != NULL) - { - size_t idx = new_hash % size; - size_t hash2 = 1 + new_hash % (size - 2); - while (1) - { - if (entries[idx].hashval == new_hash - && strcmp (entries[idx].name, undef_name) == 0) - { - result->s = entries[idx].sym; - result->m = (struct link_map *) entries[idx].map; - __rtld_lock_unlock_recursive (tab->lock); - return 1; - } - - if (entries[idx].hashval == 0 - && entries[idx].name == NULL) - break; - - idx += hash2; - if (idx >= size) - idx -= size; - } - - if (size * 3 <= tab->n_elements) - { - /* Expand the table. */ - size_t newsize = _dl_higher_prime_number (size); - struct unique_sym *newentries - = calloc (sizeof (struct unique_sym), newsize); - if (newentries == NULL) - { - nomem: - __rtld_lock_unlock_recursive (tab->lock); - _dl_fatal_printf ("out of memory\n"); - } - - for (idx = 0; idx < size; ++idx) - if (entries[idx].hashval != 0) - enter (newentries, newsize, entries[idx].hashval, - entries[idx].name, entries[idx].sym, - entries[idx].map); - - tab->free (entries); - tab->size = newsize; - entries = tab->entries = newentries; - tab->free = free; - } - } - else - { -#define INITIAL_NUNIQUE_SYM_TABLE 31 - size = INITIAL_NUNIQUE_SYM_TABLE; - entries = calloc (sizeof (struct unique_sym), size); - if (entries == NULL) - goto nomem; - - tab->entries = entries; - tab->size = size; - tab->free = free; - } - - enter (entries, size, new_hash, strtab + sym->st_name, sym, map); - ++tab->n_elements; - - __rtld_lock_unlock_recursive (tab->lock); - - goto success; - - default: - /* Local symbols are ignored. */ - break; - } - } - - /* If this current map is the one mentioned in the verneed entry - and we have not found a weak entry, it is a bug. */ - if (symidx == STN_UNDEF && version != NULL && version->filename != NULL - && __builtin_expect (_dl_name_match_p (version->filename, map), 0)) - return -1; - } - while (++i < n); - - /* We have not found anything until now. */ - return 0; -} From b0ecde3a63fd3e987137aa9eb76da3b556b14559 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 10 Jul 2009 12:04:14 -0700 Subject: [PATCH 16/50] Add AVX support to ld.so auditing for x86-64. 
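
[Editorial illustration -- the sketch below is an addition to this
collection, not part of the original commit.  It shows the consumer side
of this change: an LD_AUDIT module whose pltenter hook runs inside
_dl_runtime_profile, which is why the trampoline now has to preserve the
full 256-bit ymm registers around the call.  The file name and the choice
to print only %rdi are invented; the la_* interface itself is the one
declared in <link.h> and extended by this patch.]

    /* audit-sketch.c -- a minimal LD_AUDIT module.
       Build: gcc -shared -fPIC -o audit-sketch.so audit-sketch.c
       Use:   LD_AUDIT=./audit-sketch.so ./some-program  */
    #define _GNU_SOURCE 1
    #include <dlfcn.h>
    #include <link.h>
    #include <stdint.h>
    #include <stdio.h>

    unsigned int
    la_version (unsigned int v)
    {
      /* Accept whatever auditing interface version ld.so offers.  */
      return v;
    }

    unsigned int
    la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
    {
      /* Request PLT tracing for every object; without these flags the
         pltenter/pltexit hooks are never called.  */
      return LA_FLG_BINDTO | LA_FLG_BINDFROM;
    }

    ElfW(Addr)
    la_x86_64_gnu_pltenter (ElfW(Sym) *sym, unsigned int ndx,
                            uintptr_t *refcook, uintptr_t *defcook,
                            La_x86_64_regs *regs, unsigned int *flags,
                            const char *symname, long int *framesizep)
    {
      /* Integer argument registers were always visible here; what this
         patch changes is the vector state: on AVX hardware ld.so now
         saves and restores the full ymm registers (exposed through
         regs->lr_vector) instead of only their xmm halves, so a hook
         like this can no longer corrupt AVX arguments.  */
      printf ("enter %s: rdi=%#lx\n", symname,
              (unsigned long int) regs->lr_rdi);
      return sym->st_value;   /* Do not redirect the call.  */
    }

[Since the sketch leaves *framesizep untouched, ld.so does not call the
pltexit hook; the tst-auditmod4b.c module below exercises the full
enter/exit protocol, including the new lr_vector fields.]
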
---
 ChangeLog                       |  34 ++++
 config.h.in                     |   3 +
 config.make.in                  |   1 +
 configure                       |   2 +
 configure.in                    |   1 +
 elf/Makefile                    |  22 ++-
 elf/tst-audit4.c                |  35 ++++
 elf/tst-audit5.c                |  21 +++
 elf/tst-auditmod4a.c            |  48 ++++++
 elf/tst-auditmod4b.c            | 206 ++++++++++++++++++++++
 elf/tst-auditmod5a.c            |  46 +++++
 elf/tst-auditmod5b.c            | 178 +++++++++++++++++++
 sysdeps/x86_64/Makefile         |   1 +
 sysdeps/x86_64/bits/link.h      |  12 ++
 sysdeps/x86_64/dl-trampoline.S  | 179 ++++++--------------
 sysdeps/x86_64/dl-trampoline.h  | 291 ++++++++++++++++++++++++++++++++
 sysdeps/x86_64/elf/configure    |  25 +++
 sysdeps/x86_64/elf/configure.in |  11 ++
 sysdeps/x86_64/link-defines.sym |  28 +++
 19 files changed, 1018 insertions(+), 126 deletions(-)
 create mode 100644 elf/tst-audit4.c
 create mode 100644 elf/tst-audit5.c
 create mode 100644 elf/tst-auditmod4a.c
 create mode 100644 elf/tst-auditmod4b.c
 create mode 100644 elf/tst-auditmod5a.c
 create mode 100644 elf/tst-auditmod5b.c
 create mode 100644 sysdeps/x86_64/dl-trampoline.h
 create mode 100644 sysdeps/x86_64/link-defines.sym

diff --git a/ChangeLog b/ChangeLog
index 04760b065c..e90d19fc78 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,37 @@
+2009-07-07  H.J. Lu
+
+	* config.h.in: Add HAVE_AVX_SUPPORT entry.
+	* config.make.in: Add config-cflags-avx entry.
+	* configure.in: Substitute libc_cv_cc_avx.
+	* elf/Makefile: Add rules to build and run tst-audit4 and tst-audit5.
+	* elf/tst-audit4.c: New file.
+	* elf/tst-audit5.c: New file.
+	* elf/tst-auditmod4a.c: New file.
+	* elf/tst-auditmod4b.c: New file.
+	* elf/tst-auditmod5a.c: New file.
+	* elf/tst-auditmod5b.c: New file.
+	* sysdeps/x86_64/Makefile (gen-as-const-headers): Add
+	link-defines.sym.
+	* sysdeps/x86_64/bits/link.h (La_x86_64_ymm): New.
+	(La_x86_64_vector): Likewise.
+	(La_x86_64_regs): Append lr_vector.
+	(La_x86_64_retval): Append lrv_vector0/lrv_vector1.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Move
+	saving and restoring SSE registers to ...
+	* sysdeps/x86_64/dl-trampoline.h: This.  New file.
+	* sysdeps/x86_64/dl-trampoline.S: Include <config.h> and
+	<link-defines.h>.
+	(_dl_runtime_profile): Use LR_SIZE to allocate space for
+	La_x86_64_regs.  Allocate extra space and jump to memory at
+	save_and_restore_vector if HAVE_AVX_SUPPORT is defined.
+	(save_and_restore_vector_sse): New.
+	(save_and_restore_vector_avx): Likewise.
+	(check_avx): Likewise.
+	(save_and_restore_vector): Likewise.
+	* sysdeps/x86_64/elf/configure.in: Set libc_cv_cc_avx and
+	HAVE_AVX_SUPPORT.
+	* sysdeps/x86_64/link-defines.sym: New file.
+
 2009-07-10  Ulrich Drepper
 
 	* elf/do-lookup.h: Removed after folding content into...
diff --git a/config.h.in b/config.h.in
index 5f16874584..18bf01a38c 100644
--- a/config.h.in
+++ b/config.h.in
@@ -135,6 +135,9 @@
 /* Define if gcc supports SSE4.  */
 #undef HAVE_SSE4_SUPPORT
 
+/* Define if gcc supports AVX.  */
+#undef HAVE_AVX_SUPPORT
+
 /* Define if the compiler's exception support is based on libunwind.
*/ #undef HAVE_CC_WITH_LIBUNWIND diff --git a/config.make.in b/config.make.in index 5fb5c8110c..d65706ceac 100644 --- a/config.make.in +++ b/config.make.in @@ -35,6 +35,7 @@ cflags-cpu = @libc_cv_cc_submachine@ asflags-cpu = @libc_cv_cc_submachine@ config-cflags-sse4 = @libc_cv_cc_sse4@ +config-cflags-avx = @libc_cv_cc_avx@ defines = @DEFINES@ sysincludes = @SYSINCLUDES@ diff --git a/configure b/configure index 4e49f7068b..48e6952bbd 100755 --- a/configure +++ b/configure @@ -657,6 +657,7 @@ xcoff elf ldd_rewrite_script use_ldconfig +libc_cv_cc_avx libc_cv_cc_sse4 libc_cv_cpp_asm_debuginfo libc_cv_forced_unwind @@ -8772,6 +8773,7 @@ fi + if test $elf = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_ELF 1 diff --git a/configure.in b/configure.in index 61c87418e6..4584afe605 100644 --- a/configure.in +++ b/configure.in @@ -2277,6 +2277,7 @@ AC_SUBST(libc_cv_forced_unwind) dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests AC_SUBST(libc_cv_cpp_asm_debuginfo) AC_SUBST(libc_cv_cc_sse4) +AC_SUBST(libc_cv_cc_avx) AC_SUBST(use_ldconfig) AC_SUBST(ldd_rewrite_script) diff --git a/elf/Makefile b/elf/Makefile index 3e656ae0ad..e4b977e9e3 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -89,8 +89,10 @@ distribute := rtld-Rules \ unload4mod1.c unload4mod2.c unload4mod3.c unload4mod4.c \ unload6mod1.c unload6mod2.c unload6mod3.c \ unload7mod1.c unload7mod2.c \ - tst-audit1.c tst-audit2.c tst-audit3.c \ + tst-audit1.c tst-audit2.c tst-audit3.c tst-audit4.c \ tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \ + tst-auditmod4a.c tst-auditmod4b.c \ + tst-audit5.c tst-auditmod5a.c tst-auditmod5b.c \ order2mod1.c order2mod2.c order2mod3.c order2mod4.c \ tst-stackguard1.c tst-stackguard1-static.c \ tst-array5.c tst-array5-static.c tst-array5dep.c \ @@ -198,7 +200,7 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \ test-srcs = tst-pathopt tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog ifeq (x86_64,$(config-machine)) -tests += tst-audit3 +tests += tst-audit3 tst-audit4 tst-audit5 endif endif ifeq (yesyes,$(have-fpie)$(build-shared)) @@ -238,6 +240,8 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \ tst-dlopenrpathmod tst-deep1mod1 tst-deep1mod2 tst-deep1mod3 \ tst-dlmopen1mod tst-auditmod1 \ tst-auditmod3a tst-auditmod3b \ + tst-auditmod4a tst-auditmod4b \ + tst-auditmod5a tst-auditmod5b \ unload3mod1 unload3mod2 unload3mod3 unload3mod4 \ unload4mod1 unload4mod2 unload4mod3 unload4mod4 \ unload6mod1 unload6mod2 unload6mod3 \ @@ -973,6 +977,14 @@ $(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so $(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so +$(objpfx)tst-audit4: $(objpfx)tst-auditmod4a.so +$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so +tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so + +$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so +$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so +tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so + $(objpfx)tst-global1: $(libdl) $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so @@ -1115,3 +1127,9 @@ $(objpfx)tst-unique1.out: $(objpfx)tst-unique1mod1.so \ $(objpfx)tst-unique2: $(libdl) $(objpfx)tst-unique2mod1.so $(objpfx)tst-unique2.out: $(objpfx)tst-unique2mod2.so + +ifeq (yes,$(config-cflags-avx)) +CFLAGS-tst-audit4.c += -mavx +CFLAGS-tst-auditmod4a.c += -mavx +CFLAGS-tst-auditmod4b.c += -mavx +endif diff --git a/elf/tst-audit4.c b/elf/tst-audit4.c new file mode 100644 index 
0000000000..b17d4a61a7
--- /dev/null
+++ b/elf/tst-audit4.c
@@ -0,0 +1,35 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#ifdef __AVX__
+#include <stdlib.h>
+#include <string.h>
+#include <cpuid.h>
+#include <immintrin.h>
+
+extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i,
+			   __m256i, __m256i, __m256i, __m256i);
+int
+main (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  /* Run AVX test only if AVX is supported.  */
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+      && (ecx & bit_AVX))
+    {
+      __m256i ymm = _mm256_setzero_si256 ();
+      __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm);
+
+      ymm = _mm256_set1_epi32 (0x12349876);
+      if (memcmp (&ymm, &ret, sizeof (ret)))
+	abort ();
+    }
+  return 0;
+}
+#else
+int
+main (void)
+{
+  return 0;
+}
+#endif
diff --git a/elf/tst-audit5.c b/elf/tst-audit5.c
new file mode 100644
index 0000000000..0094fee61f
--- /dev/null
+++ b/elf/tst-audit5.c
@@ -0,0 +1,21 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <emmintrin.h>
+
+extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i,
+			   __m128i, __m128i, __m128i, __m128i);
+int
+main (void)
+{
+  __m128i xmm = _mm_setzero_si128 ();
+  __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm);
+
+  xmm = _mm_set1_epi32 (0x12349876);
+  if (memcmp (&xmm, &ret, sizeof (ret)))
+    abort ();
+
+  return 0;
+}
diff --git a/elf/tst-auditmod4a.c b/elf/tst-auditmod4a.c
new file mode 100644
index 0000000000..c9c24c04a8
--- /dev/null
+++ b/elf/tst-auditmod4a.c
@@ -0,0 +1,48 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#ifdef __AVX__
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m256i
+audit_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3,
+	    __m256i x4, __m256i x5, __m256i x6, __m256i x7)
+{
+  __m256i ymm;
+
+  ymm = _mm256_set1_epi32 (1);
+  if (memcmp (&ymm, &x0, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (2);
+  if (memcmp (&ymm, &x1, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (3);
+  if (memcmp (&ymm, &x2, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (4);
+  if (memcmp (&ymm, &x3, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (5);
+  if (memcmp (&ymm, &x4, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (6);
+  if (memcmp (&ymm, &x5, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (7);
+  if (memcmp (&ymm, &x6, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (8);
+  if (memcmp (&ymm, &x7, sizeof (ymm)))
+    abort ();
+
+  return _mm256_setzero_si256 ();
+}
+#endif
diff --git a/elf/tst-auditmod4b.c b/elf/tst-auditmod4b.c
new file mode 100644
index 0000000000..a6d3c6a6c5
--- /dev/null
+++ b/elf/tst-auditmod4b.c
@@ -0,0 +1,206 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.
*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include + +#ifdef __AVX__ +#include +#include + +static int avx = -1; + +static int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 1); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : 
"xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m256i ymm = _mm256_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_vector[i], &ymm, sizeof (ymm)) != 0) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/elf/tst-auditmod5a.c b/elf/tst-auditmod5a.c new file mode 100644 index 0000000000..8511a70747 --- /dev/null +++ b/elf/tst-auditmod5a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include +#include +#include + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (8); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/elf/tst-auditmod5b.c b/elf/tst-auditmod5b.c new file mode 100644 index 0000000000..7e1e941126 --- /dev/null +++ b/elf/tst-auditmod5b.c @@ -0,0 +1,178 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i minusone = _mm_set1_epi32 (-1); + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (®s->lr_xmm[0], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[1], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[2], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[3], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[4], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[5], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[6], &zero, sizeof (zero)) + || memcmp (®s->lr_xmm[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + + *framesizep = 1024; + } + + asm volatile ("movdqa %0, %%xmm0" : : "x" (minusone) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (minusone) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (minusone) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (minusone) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (minusone) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (minusone) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (minusone) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (minusone) : "xmm7" ); + + 
return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+  __m128i xmm;
+
+  if (strcmp (symname, "audit_test") == 0)
+    {
+      __m128i zero = _mm_setzero_si128 ();
+      if (memcmp (&outregs->lrv_xmm0, &zero, sizeof (zero)))
+	abort ();
+
+      for (int i = 0; i < 8; i++)
+	{
+	  xmm = _mm_set1_epi32 (i + 1);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) != 0)
+	    abort ();
+	}
+
+      outregs->lrv_xmm0 = (La_x86_64_xmm) _mm_set1_epi32 (0x12349876);
+    }
+
+  xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" );
+
+  return 0;
+}
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index da82093381..78fdb04fcb 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -4,6 +4,7 @@ long-double-fcts = yes
 ifeq ($(subdir),csu)
 sysdep_routines += hp-timing
 elide-routines.os += hp-timing
+gen-as-const-headers += link-defines.sym
 endif
 
 ifeq ($(subdir),gmon)
diff --git a/sysdeps/x86_64/bits/link.h b/sysdeps/x86_64/bits/link.h
index 5676b78753..643a293bb0 100644
--- a/sysdeps/x86_64/bits/link.h
+++ b/sysdeps/x86_64/bits/link.h
@@ -65,10 +65,19 @@ __END_DECLS
 /* Registers for entry into PLT on x86-64.  */
 # if __GNUC_PREREQ (4,0)
 typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
+typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32)));
 # else
 typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__)));
 # endif
+
+typedef union
+{
+# if __GNUC_PREREQ (4,0)
+  La_x86_64_ymm ymm[2];
+# endif
+  La_x86_64_xmm xmm[4];
+} La_x86_64_vector __attribute__ ((aligned(16)));
+
 typedef struct La_x86_64_regs
 {
   uint64_t lr_rdx;
@@ -80,6 +89,7 @@ typedef struct La_x86_64_regs
   uint64_t lr_rbp;
   uint64_t lr_rsp;
   La_x86_64_xmm lr_xmm[8];
+  La_x86_64_vector lr_vector[8];
 } La_x86_64_regs;
 
 /* Return values for calls from PLT on x86-64.  */
@@ -91,6 +101,8 @@ typedef struct La_x86_64_retval
   La_x86_64_xmm lrv_xmm1;
   long double lrv_st0;
   long double lrv_st1;
+  La_x86_64_vector lrv_vector0;
+  La_x86_64_vector lrv_vector1;
 } La_x86_64_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 33e6115f7b..f605351f30 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -17,7 +17,9 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+#include <config.h>
 #include <sysdep.h>
+#include <link-defines.h>
 
 	.text
 	.globl _dl_runtime_resolve
@@ -89,135 +91,64 @@ _dl_runtime_profile:
 	/* Actively align the La_x86_64_regs structure.  */
 	andq $0xfffffffffffffff0, %rsp
-	subq $192, %rsp # sizeof(La_x86_64_regs)
+# ifdef HAVE_AVX_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	subq $(LR_SIZE + XMM_SIZE*8), %rsp
+#else
+	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
+#endif
 	movq %rsp, 24(%rbx)
 
-	movq %rdx, (%rsp)	# Fill the La_x86_64_regs structure.
- movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %rcx, 24(%rsp) - movq %rsi, 32(%rsp) - movq %rdi, 40(%rsp) - movq %rbp, 48(%rsp) - leaq 48(%rbx), %rax - movq %rax, 56(%rsp) - movaps %xmm0, 64(%rsp) - movaps %xmm1, 80(%rsp) - movaps %xmm2, 96(%rsp) - movaps %xmm3, 112(%rsp) - movaps %xmm4, 128(%rsp) - movaps %xmm5, 144(%rsp) - movaps %xmm6, 160(%rsp) - movaps %xmm7, 176(%rsp) - - movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. - movq 48(%rbx), %rdx # Load return address if needed. - movq 40(%rbx), %rsi # Copy args pushed by PLT in register. - movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index - leaq 16(%rbx), %r8 - call _dl_profile_fixup # Call resolver. - - movq %rax, %r11 # Save return value. - - movq 8(%rbx), %rax # Get back register content. - movq (%rsp), %rdx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movaps 64(%rsp), %xmm0 - movaps 80(%rsp), %xmm1 - movaps 96(%rsp), %xmm2 - movaps 112(%rsp), %xmm3 - movaps 128(%rsp), %xmm4 - movaps 144(%rsp), %xmm5 - movaps 160(%rsp), %xmm6 - movaps 176(%rsp), %xmm7 - - movq 16(%rbx), %r10 # Anything in framesize? - testq %r10, %r10 - jns 1f - - /* There's nothing in the frame size, so there - will be no call to the _dl_call_pltexit. */ - - movq 24(%rsp), %rcx # Get back registers content. - movq 32(%rsp), %rsi - movq 40(%rsp), %rdi - - movq %rbx, %rsp - movq (%rsp), %rbx - cfi_restore(rbx) - cfi_def_cfa_register(%rsp) - - addq $48, %rsp # Adjust the stack to the return value - # (eats the reloc index and link_map) - cfi_adjust_cfa_offset(-48) - jmp *%r11 # Jump to function address. + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) -1: - cfi_adjust_cfa_offset(48) - cfi_rel_offset(%rbx, 0) - cfi_def_cfa_register(%rbx) +# ifdef HAVE_AVX_SUPPORT + jmp *L(save_and_restore_vector)(%rip) - /* At this point we need to prepare new stack for the function - which has to be called. We copy the original stack to a - temporary buffer of the size specified by the 'framesize' - returned from _dl_profile_fixup */ - - leaq 56(%rbx), %rsi # stack - addq $8, %r10 - andq $0xfffffffffffffff0, %r10 - movq %r10, %rcx - subq %r10, %rsp - movq %rsp, %rdi - shrq $3, %rcx - rep - movsq - - movq 24(%rdi), %rcx # Get back register content. - movq 32(%rdi), %rsi - movq 40(%rdi), %rdi - - call *%r11 - - mov 24(%rbx), %rsp # Drop the copied stack content - - /* Now we have to prepare the La_x86_64_retval structure for the - _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, - so we just need to allocate the sizeof(La_x86_64_retval) space on - the stack, since the alignment has already been taken care of. */ - - subq $80, %rsp # sizeof(La_x86_64_retval) - movq %rsp, %rcx # La_x86_64_retval argument to %rcx. - - movq %rax, (%rcx) # Fill in the La_x86_64_retval structure. - movq %rdx, 8(%rcx) - movaps %xmm0, 16(%rcx) - movaps %xmm1, 32(%rcx) - fstpt 48(%rcx) - fstpt 64(%rcx) - - movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. - movq 40(%rbx), %rsi # Copy args pushed by PLT in register. - movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index - call _dl_call_pltexit - - movq (%rsp), %rax # Restore return registers. 
- movq 8(%rsp), %rdx - movaps 16(%rsp), %xmm0 - movaps 32(%rsp), %xmm1 - fldt 64(%rsp) - fldt 48(%rsp) - - movq %rbx, %rsp - movq (%rsp), %rbx - cfi_restore(rbx) - cfi_def_cfa_register(%rsp) - - addq $48, %rsp # Adjust the stack to the return value - # (eats the reloc index and link_map) - cfi_adjust_cfa_offset(-48) - retq + .align 16 +L(save_and_restore_vector_sse): +# endif + +# define MOVXMM movaps +# include "dl-trampoline.h" + +# ifdef HAVE_AVX_SUPPORT +# undef MOVXMM +# define MOVXMM vmovdqa +# define RESTORE_AVX + .align 16 +L(save_and_restore_vector_avx): +# include "dl-trampoline.h" +# endif cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile + +# ifdef HAVE_AVX_SUPPORT +L(check_avx): + mov %rbx,%r11 # Save rbx + movl $1, %eax + cpuid + mov %r11,%rbx # Restore rbx + leaq L(save_and_restore_vector_sse)(%rip), %rax + andl $(1 << 28), %ecx # Check if AVX is available. + jz L(ret) + leaq L(save_and_restore_vector_avx)(%rip), %rax +L(ret): + movq %rax,L(save_and_restore_vector)(%rip) + jmp *%rax + + .section .data.rel.local,"aw",@progbits + .align 8 +L(save_and_restore_vector): + .quad L(check_avx) +# endif #endif diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h new file mode 100644 index 0000000000..d63b7d03c4 --- /dev/null +++ b/sysdeps/x86_64/dl-trampoline.h @@ -0,0 +1,291 @@ +/* Partial PLT profile trampoline to save and restore x86-64 vector + registers. + Copyright (C) 2009 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + + leaq 48(%rbx), %rax + movq %rax, LR_RSP_OFFSET(%rsp) + + /* This is to provide backward binary compatility for existing + audit modules. */ + MOVXMM %xmm0, (LR_XMM_OFFSET)(%rsp) + MOVXMM %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + MOVXMM %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + MOVXMM %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + MOVXMM %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + MOVXMM %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + MOVXMM %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + MOVXMM %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +#ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) + vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. 
*/ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) +#endif + + movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. + movq 48(%rbx), %rdx # Load return address if needed. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + leaq 16(%rbx), %r8 + call _dl_profile_fixup # Call resolver. + + movq %rax, %r11 # Save return value. + + movq 8(%rbx), %rax # Get back register content. + movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + +#ifdef RESTORE_AVX + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 1f + + /* We restore AVX registers only if xmm0-xmm7 registers are + unchanged. 
*/ + vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 + jmp 2f + +1: + vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +2: +#else + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 +#endif + + movq 16(%rbx), %r10 # Anything in framesize? + testq %r10, %r10 + jns 3f + + /* There's nothing in the frame size, so there + will be no call to the _dl_call_pltexit. */ + + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi + + movq %rbx, %rsp + movq (%rsp), %rbx + cfi_restore(rbx) + cfi_def_cfa_register(%rsp) + + addq $48, %rsp # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + jmp *%r11 # Jump to function address. + +3: + cfi_adjust_cfa_offset(48) + cfi_rel_offset(%rbx, 0) + cfi_def_cfa_register(%rbx) + + /* At this point we need to prepare new stack for the function + which has to be called. We copy the original stack to a + temporary buffer of the size specified by the 'framesize' + returned from _dl_profile_fixup */ + + leaq LR_RSP_OFFSET(%rbx), %rsi # stack + addq $8, %r10 + andq $0xfffffffffffffff0, %r10 + movq %r10, %rcx + subq %r10, %rsp + movq %rsp, %rdi + shrq $3, %rcx + rep + movsq + + movq 24(%rdi), %rcx # Get back register content. + movq 32(%rdi), %rsi + movq 40(%rdi), %rdi + + call *%r11 + + mov 24(%rbx), %rsp # Drop the copied stack content + + /* Now we have to prepare the La_x86_64_retval structure for the + _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, + so we just need to allocate the sizeof(La_x86_64_retval) space on + the stack, since the alignment has already been taken care of. */ +#ifdef RESTORE_AVX + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + subq $(LRV_SIZE + XMM_SIZE*2), %rsp +#else + subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) +#endif + movq %rsp, %rcx # La_x86_64_retval argument to %rcx. + + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + MOVXMM %xmm0, LRV_XMM0_OFFSET(%rcx) + MOVXMM %xmm1, LRV_XMM1_OFFSET(%rcx) + +#ifdef RESTORE_AVX + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) + vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. 
*/ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) +#endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) + + movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + call _dl_call_pltexit + + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + +#ifdef RESTORE_AVX + /* Check if xmm0/xmm1 registers are changed by audit module. */ + vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 4f + + /* We restore AVX registers only if xmm0/xmm1 registers are + unchanged. */ + vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm0 + vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + jne 4f + + vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 + vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 + jmp 5f + +4: + vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 + vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1 +5: +#else + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 +#endif + + fldt LRV_ST0_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) + + movq %rbx, %rsp + movq (%rsp), %rbx + cfi_restore(rbx) + cfi_def_cfa_register(%rsp) + + addq $48, %rsp # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + retq diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure index 774654997d..221e74c2b8 100755 --- a/sysdeps/x86_64/elf/configure +++ b/sysdeps/x86_64/elf/configure @@ -79,3 +79,28 @@ cat >>confdefs.h <<\_ACEOF #define PI_STATIC_AND_HIDDEN 1 _ACEOF + +{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 +$as_echo_n "checking for AVX support... " >&6; } +if test "${libc_cv_cc_avx+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +$as_echo "$libc_cv_cc_avx" >&6; } +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in index 9cb59d009c..14d1875302 100644 --- a/sysdeps/x86_64/elf/configure.in +++ b/sysdeps/x86_64/elf/configure.in @@ -32,3 +32,14 @@ fi dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) + +dnl Check if -mavx works. 
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
+ libc_cv_cc_avx=yes
+else
+ libc_cv_cc_avx=no
+fi])
+if test $libc_cv_cc_avx = yes; then
+ AC_DEFINE(HAVE_AVX_SUPPORT)
+fi
diff --git a/sysdeps/x86_64/link-defines.sym b/sysdeps/x86_64/link-defines.sym
new file mode 100644
index 0000000000..1694d883ad
--- /dev/null
+++ b/sysdeps/x86_64/link-defines.sym
@@ -0,0 +1,28 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+VECTOR_SIZE sizeof (La_x86_64_vector)
+XMM_SIZE sizeof (La_x86_64_xmm)
+
+LR_SIZE sizeof (struct La_x86_64_regs)
+LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx)
+LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8)
+LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9)
+LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx)
+LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi)
+LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi)
+LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp)
+LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp)
+LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm)
+LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector)
+
+LRV_SIZE sizeof (struct La_x86_64_retval)
+LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax)
+LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx)
+LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0)
+LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1)
+LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0)
+LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1)
+LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0)
+LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1)
From 59cbcac015cdd446c346cfd2c2ada3f94ef540b2 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 15 Jul 2009 08:27:19 -0700
Subject: [PATCH 17/50] Fix build issue with modules for audit test on machines != x86-64.

---
 ChangeLog | 5 +++++
 elf/Makefile | 8 +++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e90d19fc78..bece41b7a2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2009-07-07 H.J. Lu
+
+ * elf/Makefile: Don't build modules for tst-audit4 and tst-audit5
+ for anything but x86-64 targets.
+
 2009-07-07 H.J. Lu
 
 * config.h.in: Add HAVE_AVX_SUPPORT entry.
diff --git a/elf/Makefile b/elf/Makefile
index e4b977e9e3..21d131ec92 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -239,9 +239,6 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \
 $(modules-execstack-$(have-z-execstack)) \
 tst-dlopenrpathmod tst-deep1mod1 tst-deep1mod2 tst-deep1mod3 \
 tst-dlmopen1mod tst-auditmod1 \
- tst-auditmod3a tst-auditmod3b \
- tst-auditmod4a tst-auditmod4b \
- tst-auditmod5a tst-auditmod5b \
 unload3mod1 unload3mod2 unload3mod3 unload3mod4 \
 unload4mod1 unload4mod2 unload4mod3 unload4mod4 \
 unload6mod1 unload6mod2 unload6mod3 \
@@ -255,6 +252,11 @@ endif
 ifeq (yesyes,$(have-fpie)$(build-shared))
 modules-names += tst-piemod1
 endif
+ifeq (x86_64,$(config-machine))
+modules-names += tst-auditmod3a tst-auditmod3b \
+ tst-auditmod4a tst-auditmod4b \
+ tst-auditmod5a tst-auditmod5b
+endif
 modules-execstack-yes = tst-execstack-mod
 extra-test-objs += $(addsuffix .os,$(strip $(modules-names)))
 # We need this variable to be sure the test modules get the right CPPFLAGS.
From d7bd7a8ae8cdb3f1414b1e032759d9ef324eb040 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 15 Jul 2009 17:41:36 -0700
Subject: [PATCH 18/50] Secure AVX changes for auditing code.
The original AVX patch used a function pointer to handle the difference between machines with and without AVX support. This is insecure. A well-placed memory exploit could lead to redirection of the execution. Using a variable and several tests is a bit slower but cannot be exploited in this way. --- ChangeLog | 7 +- sysdeps/x86_64/dl-trampoline.S | 327 +++++++++++++++++++++++++++++---- sysdeps/x86_64/dl-trampoline.h | 291 ----------------------------- 3 files changed, 299 insertions(+), 326 deletions(-) delete mode 100644 sysdeps/x86_64/dl-trampoline.h diff --git a/ChangeLog b/ChangeLog index bece41b7a2..c355ea4be1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,8 @@ -2009-07-07 H.J. Lu +2009-07-15 Ulrich Drepper - * elf/Makefile: Don't build modules for tst-audit4 and tst-audit5 - for anything but x86-64 targets. + * sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into... + * sysdeps/x86-64/dl-trampoline.S: ...here. Rewrite to avoid function + pointers in writable memory. 2009-07-07 H.J. Lu diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index f605351f30..2f55639662 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -96,9 +96,9 @@ _dl_runtime_profile: to detect if any xmm0-xmm7 registers are changed by audit module. */ subq $(LR_SIZE + XMM_SIZE*8), %rsp -#else +# else subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs) -#endif +# endif movq %rsp, 24(%rbx) /* Fill the La_x86_64_regs structure. */ @@ -110,45 +110,308 @@ _dl_runtime_profile: movq %rdi, LR_RDI_OFFSET(%rsp) movq %rbp, LR_RBP_OFFSET(%rsp) + leaq 48(%rbx), %rax + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatility for existing + audit modules. */ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + # ifdef HAVE_AVX_SUPPORT - jmp *L(save_and_restore_vector)(%rip) + .data +L(have_avx): + .zero 4 + .size L(have_avx), 4 + .previous - .align 16 -L(save_and_restore_vector_sse): + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_eax)(%rip) + cmpl $0, %eax + +1: js L(no_avx1) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) + vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. 
*/ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) + +L(no_avx1): # endif -# define MOVXMM movaps -# include "dl-trampoline.h" + movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. + movq 48(%rbx), %rdx # Load return address if needed. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + leaq 16(%rbx), %r8 + call _dl_profile_fixup # Call resolver. + + movq %rax, %r11 # Save return value. + + movq 8(%rbx), %rax # Get back register content. + movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 # ifdef HAVE_AVX_SUPPORT -# undef MOVXMM -# define MOVXMM vmovdqa -# define RESTORE_AVX - .align 16 -L(save_and_restore_vector_avx): -# include "dl-trampoline.h" + cmpl $0, L(have_avx)(%rip) + js L(no_avx2) + + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3 + vpmovmskb %xmm3, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4 + vpmovmskb %xmm4, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5 + vpmovmskb %xmm5, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6 + vpmovmskb %xmm6, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7 + vpmovmskb %xmm7, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 + +1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 + jmp 1f + +L(no_avx2): + vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +1: +# else + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps 
(LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 # endif - cfi_endproc - .size _dl_runtime_profile, .-_dl_runtime_profile + movq 16(%rbx), %r10 # Anything in framesize? + testq %r10, %r10 + jns 3f + + /* There's nothing in the frame size, so there + will be no call to the _dl_call_pltexit. */ + + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi + movq %rbx, %rsp + movq (%rsp), %rbx + cfi_restore(rbx) + cfi_def_cfa_register(%rsp) + + addq $48, %rsp # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + jmp *%r11 # Jump to function address. + +3: + cfi_adjust_cfa_offset(48) + cfi_rel_offset(%rbx, 0) + cfi_def_cfa_register(%rbx) + + /* At this point we need to prepare new stack for the function + which has to be called. We copy the original stack to a + temporary buffer of the size specified by the 'framesize' + returned from _dl_profile_fixup */ + + leaq LR_RSP_OFFSET(%rbx), %rsi # stack + addq $8, %r10 + andq $0xfffffffffffffff0, %r10 + movq %r10, %rcx + subq %r10, %rsp + movq %rsp, %rdi + shrq $3, %rcx + rep + movsq + + movq 24(%rdi), %rcx # Get back register content. + movq 32(%rdi), %rsi + movq 40(%rdi), %rdi + + call *%r11 + + mov 24(%rbx), %rsp # Drop the copied stack content + + /* Now we have to prepare the La_x86_64_retval structure for the + _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, + so we just need to allocate the sizeof(La_x86_64_retval) space on + the stack, since the alignment has already been taken care of. */ # ifdef HAVE_AVX_SUPPORT -L(check_avx): - mov %rbx,%r11 # Save rbx - movl $1, %eax - cpuid - mov %r11,%rbx # Restore rbx - leaq L(save_and_restore_vector_sse)(%rip), %rax - andl $(1 << 28), %ecx # Check if AVX is available. - jz L(ret) - leaq L(save_and_restore_vector_avx)(%rip), %rax -L(ret): - movq %rax,L(save_and_restore_vector)(%rip) - jmp *%rax - - .section .data.rel.local,"aw",@progbits - .align 8 -L(save_and_restore_vector): - .quad L(check_avx) + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + subq $(LRV_SIZE + XMM_SIZE*2), %rsp +# else + subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) +# endif + movq %rsp, %rcx # La_x86_64_retval argument to %rcx. + + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx3) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) + vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. */ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) + +L(no_avx3): # endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) + + movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. + movq 40(%rbx), %rsi # Copy args pushed by PLT in register. + movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index + call _dl_call_pltexit + + /* Restore return registers. 
*/ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx4) + + /* Check if xmm0/xmm1 registers are changed by audit module. */ + vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1 + vpmovmskb %xmm1, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 + +1: vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1 + vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 + jmp 1f + +L(no_avx4): + vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 + vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1 + +1: +# else + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) + + movq %rbx, %rsp + movq (%rsp), %rbx + cfi_restore(rbx) + cfi_def_cfa_register(%rsp) + + addq $48, %rsp # Adjust the stack to the return value + # (eats the reloc index and link_map) + cfi_adjust_cfa_offset(-48) + retq + + cfi_endproc + .size _dl_runtime_profile, .-_dl_runtime_profile #endif diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h deleted file mode 100644 index d63b7d03c4..0000000000 --- a/sysdeps/x86_64/dl-trampoline.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Partial PLT profile trampoline to save and restore x86-64 vector - registers. - Copyright (C) 2009 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - - leaq 48(%rbx), %rax - movq %rax, LR_RSP_OFFSET(%rsp) - - /* This is to provide backward binary compatility for existing - audit modules. */ - MOVXMM %xmm0, (LR_XMM_OFFSET)(%rsp) - MOVXMM %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) - MOVXMM %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) - MOVXMM %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) - MOVXMM %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) - MOVXMM %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) - MOVXMM %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) - MOVXMM %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) - -#ifdef RESTORE_AVX - /* This is to support AVX audit modules. */ - vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) - vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) - vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) - vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) - vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) - vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) - vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) - vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) - - /* Save xmm0-xmm7 registers to detect if any of them are - changed by audit module. 
*/ - vmovdqa %xmm0, (LR_SIZE)(%rsp) - vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) - vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) - vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) - vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) - vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) - vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) - vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) -#endif - - movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. - movq 48(%rbx), %rdx # Load return address if needed. - movq 40(%rbx), %rsi # Copy args pushed by PLT in register. - movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index - leaq 16(%rbx), %r8 - call _dl_profile_fixup # Call resolver. - - movq %rax, %r11 # Save return value. - - movq 8(%rbx), %rax # Get back register content. - movq LR_RDX_OFFSET(%rsp), %rdx - movq LR_R8_OFFSET(%rsp), %r8 - movq LR_R9_OFFSET(%rsp), %r9 - -#ifdef RESTORE_AVX - /* Check if any xmm0-xmm7 registers are changed by audit - module. */ - vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 1f - - /* We restore AVX registers only if xmm0-xmm7 registers are - unchanged. 
*/ - vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 - vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 - jmp 2f - -1: - vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 - -2: -#else - movaps (LR_XMM_OFFSET)(%rsp), %xmm0 - movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 - movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 - movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 - movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 - movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 - movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 - movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 -#endif - - movq 16(%rbx), %r10 # Anything in framesize? - testq %r10, %r10 - jns 3f - - /* There's nothing in the frame size, so there - will be no call to the _dl_call_pltexit. */ - - /* Get back registers content. */ - movq LR_RCX_OFFSET(%rsp), %rcx - movq LR_RSI_OFFSET(%rsp), %rsi - movq LR_RDI_OFFSET(%rsp), %rdi - - movq %rbx, %rsp - movq (%rsp), %rbx - cfi_restore(rbx) - cfi_def_cfa_register(%rsp) - - addq $48, %rsp # Adjust the stack to the return value - # (eats the reloc index and link_map) - cfi_adjust_cfa_offset(-48) - jmp *%r11 # Jump to function address. - -3: - cfi_adjust_cfa_offset(48) - cfi_rel_offset(%rbx, 0) - cfi_def_cfa_register(%rbx) - - /* At this point we need to prepare new stack for the function - which has to be called. We copy the original stack to a - temporary buffer of the size specified by the 'framesize' - returned from _dl_profile_fixup */ - - leaq LR_RSP_OFFSET(%rbx), %rsi # stack - addq $8, %r10 - andq $0xfffffffffffffff0, %r10 - movq %r10, %rcx - subq %r10, %rsp - movq %rsp, %rdi - shrq $3, %rcx - rep - movsq - - movq 24(%rdi), %rcx # Get back register content. - movq 32(%rdi), %rsi - movq 40(%rdi), %rdi - - call *%r11 - - mov 24(%rbx), %rsp # Drop the copied stack content - - /* Now we have to prepare the La_x86_64_retval structure for the - _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, - so we just need to allocate the sizeof(La_x86_64_retval) space on - the stack, since the alignment has already been taken care of. */ -#ifdef RESTORE_AVX - /* sizeof(La_x86_64_retval). Need extra space for 2 SSE - registers to detect if xmm0/xmm1 registers are changed - by audit module. */ - subq $(LRV_SIZE + XMM_SIZE*2), %rsp -#else - subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) -#endif - movq %rsp, %rcx # La_x86_64_retval argument to %rcx. - - /* Fill in the La_x86_64_retval structure. */ - movq %rax, LRV_RAX_OFFSET(%rcx) - movq %rdx, LRV_RDX_OFFSET(%rcx) - - MOVXMM %xmm0, LRV_XMM0_OFFSET(%rcx) - MOVXMM %xmm1, LRV_XMM1_OFFSET(%rcx) - -#ifdef RESTORE_AVX - /* This is to support AVX audit modules. */ - vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) - vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) - - /* Save xmm0/xmm1 registers to detect if they are changed - by audit module. 
*/ - vmovdqa %xmm0, (LRV_SIZE)(%rcx) - vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) -#endif - - fstpt LRV_ST0_OFFSET(%rcx) - fstpt LRV_ST1_OFFSET(%rcx) - - movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. - movq 40(%rbx), %rsi # Copy args pushed by PLT in register. - movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index - call _dl_call_pltexit - - /* Restore return registers. */ - movq LRV_RAX_OFFSET(%rsp), %rax - movq LRV_RDX_OFFSET(%rsp), %rdx - -#ifdef RESTORE_AVX - /* Check if xmm0/xmm1 registers are changed by audit module. */ - vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 - vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 4f - - /* We restore AVX registers only if xmm0/xmm1 registers are - unchanged. */ - vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm0 - vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi - cmpl $0xffff, %esi - jne 4f - - vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 - vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 - jmp 5f - -4: - vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0 - vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1 -5: -#else - movaps LRV_XMM0_OFFSET(%rsp), %xmm0 - movaps LRV_XMM1_OFFSET(%rsp), %xmm1 -#endif - - fldt LRV_ST0_OFFSET(%rsp) - fldt LRV_ST0_OFFSET(%rsp) - - movq %rbx, %rsp - movq (%rsp), %rbx - cfi_restore(rbx) - cfi_def_cfa_register(%rsp) - - addq $48, %rsp # Adjust the stack to the return value - # (eats the reloc index and link_map) - cfi_adjust_cfa_offset(-48) - retq From 47fc9b710bcadb4196f8ef71813d6724d954fcb2 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 15 Jul 2009 17:51:11 -0700 Subject: [PATCH 19/50] Fix typo in last change. --- sysdeps/x86_64/dl-trampoline.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 2f55639662..d09001bb58 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -142,7 +142,7 @@ L(have_avx): testl $(1 << 28), %ecx jne 2f negl %eax -2: movl %eax, L(have_eax)(%rip) +2: movl %eax, L(have_avx)(%rip) cmpl $0, %eax 1: js L(no_avx1) From ca419225a3c4f9f341eddf582b201211d1bf2aec Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 15 Jul 2009 17:59:14 -0700 Subject: [PATCH 20/50] Fix thinko in AVX audit patch. Don't use AVX instructions too often. --- sysdeps/x86_64/dl-trampoline.S | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index d09001bb58..7f20491130 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -249,17 +249,7 @@ L(no_avx1): jmp 1f L(no_avx2): - vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 - vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 - -1: -# else +# endif movaps (LR_XMM_OFFSET)(%rsp), %xmm0 movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 @@ -268,9 +258,8 @@ L(no_avx2): movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 -# endif - movq 16(%rbx), %r10 # Anything in framesize? +1: movq 16(%rbx), %r10 # Anything in framesize? 
 testq %r10, %r10
 jns 3f
 
@@ -390,16 +379,11 @@ L(no_avx3):
 jmp 1f
 
 L(no_avx4):
- vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0
- vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
-
-1:
-# else
+# endif
 movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
- fldt LRV_ST1_OFFSET(%rsp)
+1: fldt LRV_ST1_OFFSET(%rsp)
 fldt LRV_ST0_OFFSET(%rsp)
 
 movq %rbx, %rsp
From e26c9b84155f31b37730fec7621f1d9a805b314d Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Thu, 16 Jul 2009 07:00:34 -0700
Subject: [PATCH 21/50] memcmp implementation for x86-64 using SSE2.

---
 ChangeLog | 4 +
 sysdeps/x86_64/memcmp.S | 359 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 363 insertions(+)
 create mode 100644 sysdeps/x86_64/memcmp.S

diff --git a/ChangeLog b/ChangeLog
index c355ea4be1..87db19e000 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2009-07-15 H.J. Lu
+
+ * sysdeps/x86_64/memcmp.S: New file.
+
 2009-07-15 Ulrich Drepper
 
 * sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into...
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
new file mode 100644
index 0000000000..165f42e17d
--- /dev/null
+++ b/sysdeps/x86_64/memcmp.S
@@ -0,0 +1,359 @@
+/* memcmp with SSE2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (memcmp)
+ test %rdx, %rdx
+ jz L(finz)
+ cmpq $1, %rdx
+ jle L(finr1b)
+ subq %rdi, %rsi
+ movq %rdx, %r10
+ cmpq $32, %r10
+ jge L(gt32)
+ /* Handle small chunks and last block of less than 32 bytes.
*/ +L(small): + testq $1, %r10 + jz L(s2b) + movzbl (%rdi), %eax + movzbl (%rdi, %rsi), %edx + subq $1, %r10 + je L(finz1) + addq $1, %rdi + subl %edx, %eax + jnz L(exit) +L(s2b): + testq $2, %r10 + jz L(s4b) + movzwl (%rdi), %eax + movzwl (%rdi, %rsi), %edx + subq $2, %r10 + je L(fin2_7) + addq $2, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s4b): + testq $4, %r10 + jz L(s8b) + movl (%rdi), %eax + movl (%rdi, %rsi), %edx + subq $4, %r10 + je L(fin2_7) + addq $4, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s8b): + testq $8, %r10 + jz L(s16b) + movq (%rdi), %rax + movq (%rdi, %rsi), %rdx + subq $8, %r10 + je L(fin2_7) + addq $8, %rdi + cmpq %rdx, %rax + jnz L(fin2_7) +L(s16b): + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + xorl %eax, %eax + subl $0xffff, %edx + jz L(finz) + bsfl %edx, %ecx + leaq (%rdi, %rcx), %rcx + movzbl (%rcx), %eax + movzbl (%rsi, %rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(finr1b): + movzbl (%rdi), %eax + movzbl (%rsi), %edx +L(finz1): + subl %edx, %eax +L(exit): + ret + + .p2align 4,, 4 +L(fin2_7): + cmpq %rdx, %rax + jz L(finz) + movq %rax, %r11 + subq %rdx, %r11 + bsfq %r11, %rcx + sarq $3, %rcx + salq $3, %rcx + sarq %cl, %rax + movzbl %al, %eax + sarq %cl, %rdx + movzbl %dl, %edx + subl %edx, %eax + ret + + .p2align 4,, 4 +L(finz): + xorl %eax, %eax + ret + + /* For blocks bigger than 32 bytes + 1. Advance one of the addr pointer to be 16B aligned. + 2. Treat the case of both addr pointers aligned to 16B + separately to avoid movdqu. + 3. Handle any blocks of greater than 64 consecutive bytes with + unrolling to reduce branches. + 4. At least one addr pointer is 16B aligned, use memory version + of pcmbeqb. + */ + .p2align 4,, 4 +L(gt32): + movq %rdx, %r11 + addq %rdi, %r11 + movq %rdi, %r8 + + andq $15, %r8 + jz L(16am) + /* Both pointers may be misaligned. */ + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + subl $0xffff, %edx + jnz L(neq) + neg %r8 + leaq 16(%rdi, %r8), %rdi +L(16am): + /* Handle two 16B aligned pointers separately. */ + testq $15, %rsi + jz L(ATR) + testq $16, %rdi + jz L(A32) + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi +L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. 
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/ + .p2align 4,, 4 +END(memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) From 24a12a5a5f7ea63bc349f219b9fbb722c009a719 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 16 Jul 2009 07:02:27 -0700 Subject: [PATCH 22/50] Fix up whitespaces in new memcmp for x86-64. --- sysdeps/x86_64/memcmp.S | 84 ++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 165f42e17d..a9fe13ae58 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -26,7 +26,7 @@ ENTRY (memcmp) jz L(finz) cmpq $1, %rdx jle L(finr1b) - subq %rdi, %rsi + subq %rdi, %rsi movq %rdx, %r10 cmpq $32, %r10 jge L(gt32) @@ -37,7 +37,7 @@ L(small): movzbl (%rdi), %eax movzbl (%rdi, %rsi), %edx subq $1, %r10 - je L(finz1) + je L(finz1) addq $1, %rdi subl %edx, %eax jnz L(exit) @@ -47,7 +47,7 @@ L(s2b): movzwl (%rdi), %eax movzwl (%rdi, %rsi), %edx subq $2, %r10 - je L(fin2_7) + je L(fin2_7) addq $2, %rdi cmpl %edx, %eax jnz L(fin2_7) @@ -57,7 +57,7 @@ L(s4b): movl (%rdi), %eax movl (%rdi, %rsi), %edx subq $4, %r10 - je L(fin2_7) + je L(fin2_7) addq $4, %rdi cmpl %edx, %eax jnz L(fin2_7) @@ -67,7 +67,7 @@ L(s8b): movq (%rdi), %rax movq (%rdi, %rsi), %rdx subq $8, %r10 - je L(fin2_7) + je L(fin2_7) addq $8, %rdi cmpq %rdx, %rax jnz L(fin2_7) @@ -76,11 +76,11 @@ L(s16b): movdqu (%rdi, %rsi), %xmm0 pcmpeqb %xmm0, %xmm1 pmovmskb %xmm1, %edx - xorl %eax, %eax + xorl %eax, %eax subl $0xffff, %edx jz L(finz) - bsfl %edx, %ecx - leaq (%rdi, %rcx), %rcx + bsfl %edx, %ecx + leaq (%rdi, %rcx), %rcx movzbl (%rcx), %eax movzbl (%rsi, %rcx), %edx jmp L(finz1) @@ -88,7 +88,7 @@ L(s16b): .p2align 4,, 4 L(finr1b): movzbl (%rdi), %eax - movzbl (%rsi), %edx + movzbl (%rsi), %edx L(finz1): subl %edx, %eax L(exit): @@ -98,24 +98,24 @@ L(exit): L(fin2_7): cmpq %rdx, %rax jz L(finz) - movq %rax, %r11 - subq %rdx, %r11 + movq %rax, %r11 + subq %rdx, %r11 bsfq %r11, %rcx - sarq $3, %rcx + sarq $3, %rcx salq $3, %rcx - sarq %cl, %rax + sarq %cl, %rax movzbl %al, %eax - sarq %cl, %rdx + sarq %cl, %rdx movzbl %dl, %edx subl %edx, %eax - ret + ret .p2align 4,, 4 L(finz): xorl %eax, %eax ret - /* For blocks bigger than 32 bytes + /* For blocks bigger than 32 bytes 1. Advance one of the addr pointer to be 16B aligned. 2. Treat the case of both addr pointers aligned to 16B separately to avoid movdqu. @@ -128,10 +128,10 @@ L(finz): L(gt32): movq %rdx, %r11 addq %rdi, %r11 - movq %rdi, %r8 + movq %rdi, %r8 andq $15, %r8 - jz L(16am) + jz L(16am) /* Both pointers may be misaligned. */ movdqu (%rdi), %xmm1 movdqu (%rdi, %rsi), %xmm0 @@ -156,8 +156,8 @@ L(16am): L(A32): movq %r11, %r10 andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) + cmpq %r10, %rdi + jge L(mt16) /* Pre-unroll to be ready for unrolled 64B loop. 
*/ testq $32, %rdi jz L(A64) @@ -167,7 +167,7 @@ L(A32): subl $0xffff, %edx jnz L(neq) addq $16, %rdi - + movdqu (%rdi,%rsi), %xmm0 pcmpeqb (%rdi), %xmm0 pmovmskb %xmm0, %edx @@ -178,9 +178,9 @@ L(A32): L(A64): movq %r11, %r10 andq $-64, %r10 - cmpq %r10, %rdi - jge L(mt32) - + cmpq %r10, %rdi + jge L(mt32) + L(A64main): movdqu (%rdi,%rsi), %xmm0 pcmpeqb (%rdi), %xmm0 @@ -188,7 +188,7 @@ L(A64main): subl $0xffff, %edx jnz L(neq) addq $16, %rdi - + movdqu (%rdi,%rsi), %xmm0 pcmpeqb (%rdi), %xmm0 pmovmskb %xmm0, %edx @@ -216,8 +216,8 @@ L(A64main): L(mt32): movq %r11, %r10 andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) + cmpq %r10, %rdi + jge L(mt16) L(A32main): movdqu (%rdi,%rsi), %xmm0 @@ -226,7 +226,7 @@ L(A32main): subl $0xffff, %edx jnz L(neq) addq $16, %rdi - + movdqu (%rdi,%rsi), %xmm0 pcmpeqb (%rdi), %xmm0 pmovmskb %xmm0, %edx @@ -239,23 +239,23 @@ L(A32main): L(mt16): subq %rdi, %r11 je L(finz) - movq %r11, %r10 - jmp L(small) + movq %r11, %r10 + jmp L(small) .p2align 4,, 4 L(neq): - bsfl %edx, %ecx + bsfl %edx, %ecx movzbl (%rdi, %rcx), %eax - addq %rdi, %rsi + addq %rdi, %rsi movzbl (%rsi,%rcx), %edx jmp L(finz1) .p2align 4,, 4 L(ATR): movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) testq $16, %rdi jz L(ATR32) @@ -290,7 +290,7 @@ L(ATR32): L(ATR64): cmpq %rdi, %r10 - je L(mt32) + je L(mt32) L(ATR64main): movdqa (%rdi,%rsi), %xmm0 @@ -324,9 +324,9 @@ L(ATR64main): jne L(ATR64main) movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) L(ATR32res): movdqa (%rdi,%rsi), %xmm0 @@ -343,13 +343,13 @@ L(ATR32res): jnz L(neq) addq $16, %rdi - cmpq %r10, %rdi + cmpq %r10, %rdi jne L(ATR32res) subq %rdi, %r11 je L(finz) - movq %r11, %r10 - jmp L(small) + movq %r11, %r10 + jmp L(small) /* Align to 16byte to improve instruction fetch. */ .p2align 4,, 4 END(memcmp) From c8027cced1d3e7803c440cb13d4294754d8791e2 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 16 Jul 2009 07:15:15 -0700 Subject: [PATCH 23/50] Optimize restoring of ymm registers on x86-64. The patch mainly reduces the code size but also avoids some jumps. --- ChangeLog | 5 +++ sysdeps/x86_64/dl-trampoline.S | 77 +++++++++++++++------------------- 2 files changed, 39 insertions(+), 43 deletions(-) diff --git a/ChangeLog b/ChangeLog index 87db19e000..1bfdd7b56d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2009-07-16 Ulrich Drepper + + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize + restoring of ymm registers a bit. + 2009-07-15 H.J. Lu * sysdeps/x86_64/memcmp.S: New file. diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 7f20491130..49d239f075 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -185,81 +185,73 @@ L(no_avx1): movq LR_R8_OFFSET(%rsp), %r8 movq LR_R9_OFFSET(%rsp), %r9 + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + # ifdef HAVE_AVX_SUPPORT cmpl $0, L(have_avx)(%rip) js L(no_avx2) /* Check if any xmm0-xmm7 registers are changed by audit module. 
*/ - vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0 - vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1 - vpmovmskb %xmm1, %esi + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 - vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 - vpmovmskb %xmm2, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 - vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3 - vpmovmskb %xmm3, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 - vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4 - vpmovmskb %xmm4, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 - vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5 - vpmovmskb %xmm5, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 - vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6 - vpmovmskb %xmm6, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 - vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7 - vpmovmskb %xmm7, %esi +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 -1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 - vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi je 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 - jmp 1f L(no_avx2): +1: # endif - movaps (LR_XMM_OFFSET)(%rsp), %xmm0 - movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 - movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 - movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 - movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 - movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 - movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 - movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 - -1: movq 16(%rbx), %r10 # Anything in framesize? + movq 16(%rbx), %r10 # Anything in framesize? testq %r10, %r10 jns 3f @@ -358,32 +350,31 @@ L(no_avx3): movq LRV_RAX_OFFSET(%rsp), %rax movq LRV_RDX_OFFSET(%rsp), %rdx + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + # ifdef HAVE_AVX_SUPPORT cmpl $0, L(have_avx)(%rip) js L(no_avx4) /* Check if xmm0/xmm1 registers are changed by audit module. 
*/
-	vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0
-	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1
-	vpmovmskb %xmm1, %esi
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	je 1f
 	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0

-1:	vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
-	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	je 1f
 	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-	jmp 1f

 L(no_avx4):
+1:
 # endif

-	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
-	movaps LRV_XMM1_OFFSET(%rsp), %xmm1

-1:	fldt LRV_ST1_OFFSET(%rsp)
+	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)

 	movq %rbx, %rsp

From 55c4ce6885b577e2b29a4de674d4062a6882afe8 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Thu, 16 Jul 2009 07:18:53 -0700
Subject: [PATCH 24/50] Remove a warning, and a little optimization.

The prototype for _dl_higher_prime_number was missing.  While at it,
the function is now marked with internal_function.
---
 ChangeLog                  | 4 ++++
 elf/dl-misc.c              | 1 +
 sysdeps/generic/ldsodefs.h | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 1bfdd7b56d..48b5d029f3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2009-07-16  Ulrich Drepper

+	* sysdeps/generic/ldsodefs.h: Add prototype for
+	_dl_higher_prime_number.
+	* elf/dl-misc.c (_dl_higher_prime_number): Mark with internal_function.
+
 	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize
 	restoring of ymm registers a bit.

diff --git a/elf/dl-misc.c b/elf/dl-misc.c
index 7d4e1a1725..d50537ade7 100644
--- a/elf/dl-misc.c
+++ b/elf/dl-misc.c
@@ -315,6 +315,7 @@ _dl_name_match_p (const char *name, const struct link_map *map)

 unsigned long int
+internal_function
 _dl_higher_prime_number (unsigned long int n)
 {
   /* These are primes that are near, but slightly smaller than, a
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 1e1bb4ccd4..30f9d23091 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -335,6 +335,10 @@ struct audit_ifaces
 extern int _dl_name_match_p (const char *__name,
			     const struct link_map *__map)
     internal_function;

+/* Compute next higher prime number.  */
+extern unsigned long int _dl_higher_prime_number (unsigned long int n)
+     internal_function;
+
 /* Function used as argument for `_dl_receive_error' function.  The
    arguments are the error code, error string, and the objname the
    error occurred in.  */

From bea0ac1d8703091294fe5822d982591c849b5458 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek
Date: Thu, 16 Jul 2009 07:24:50 -0700
Subject: [PATCH 25/50] Use rel semantics of cas instead of acq semantics with
 full barrier before it in _int_free

The following patch fixes the catomic_compare_and_exchange_*_rel
definitions (which were never used and weren't correct) and uses
catomic_compare_and_exchange_val_rel in _int_free.  Compared to the
pre-2009-07-02 --enable-experimental-malloc state the generated code
should be identical on all arches other than ppc/ppc64, and on
ppc/ppc64 it should use an lwsync instead of an isync barrier.
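As an editorial aside, the ordering argument can be sketched with C11 atomics (an illustration only; the glibc source spells the operation with its catomic_* macros): when pushing a freed chunk onto the fastbin list, release semantics on the successful compare-and-exchange are exactly what publish the preceding p->fd store to whichever thread later pops the chunk, so the separate full barrier from the 2009-07-02 change is unnecessary.

    #include <stdatomic.h>

    struct chunk { struct chunk *fd; };

    /* Push P onto the list at *FB; the release on success orders the
       p->fd store before the new list head becomes visible.  */
    static void
    fastbin_push (struct chunk *_Atomic *fb, struct chunk *p)
    {
      struct chunk *old = atomic_load_explicit (fb, memory_order_relaxed);
      do
        p->fd = old;
      while (!atomic_compare_exchange_weak_explicit
               (fb, &old, p, memory_order_release, memory_order_relaxed));
    }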
--- ChangeLog | 13 +++++++++++++ include/atomic.h | 32 +++++++++++++++++++++----------- malloc/malloc.c | 3 +-- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 48b5d029f3..1e9df428c7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2009-07-13 Jakub Jelinek + + * include/atomic.h (catomic_compare_and_exchange_val_rel): If arch + overrides atomic_compare_and_exchange_val_rel, define to + atomic_compare_and_exchange_val_rel by default, otherwise default + to catomic_compare_and_exchange_val_acq. + (catomic_compare_and_exchange_bool_rel): If arch overrides + atomic_compare_and_exchange_bool_rel, define to + atomic_compare_and_exchange_bool_rel by default. + * malloc/malloc.c (_int_free): Revert 2009-07-02 change. + Use catomic_compare_and_exchange_val_rel instead of + catomic_compare_and_exchange_val_acq. + 2009-07-16 Ulrich Drepper * sysdeps/generic/ldsodefs.h: Add prototype for diff --git a/include/atomic.h b/include/atomic.h index 9366f78734..37d0111d5f 100644 --- a/include/atomic.h +++ b/include/atomic.h @@ -107,14 +107,19 @@ #endif -#ifndef atomic_compare_and_exchange_val_rel -# define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \ - atomic_compare_and_exchange_val_acq (mem, newval, oldval) +#ifndef catomic_compare_and_exchange_val_rel +# ifndef atomic_compare_and_exchange_val_rel +# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ + catomic_compare_and_exchange_val_acq (mem, newval, oldval) +# else +# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_val_rel (mem, newval, oldval) +# endif #endif -#ifndef catomic_compare_and_exchange_val_rel -# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ +#ifndef atomic_compare_and_exchange_val_rel +# define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \ atomic_compare_and_exchange_val_acq (mem, newval, oldval) #endif @@ -155,15 +160,20 @@ #endif -#ifndef atomic_compare_and_exchange_bool_rel -# define atomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ - atomic_compare_and_exchange_bool_acq (mem, newval, oldval) +#ifndef catomic_compare_and_exchange_bool_rel +# ifndef atomic_compare_and_exchange_bool_rel +# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + catomic_compare_and_exchange_bool_acq (mem, newval, oldval) +# else +# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_bool_rel (mem, newval, oldval) +# endif #endif -#ifndef catomic_compare_and_exchange_bool_rel -# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ - catomic_compare_and_exchange_bool_acq (mem, newval, oldval) +#ifndef atomic_compare_and_exchange_bool_rel +# define atomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_bool_acq (mem, newval, oldval) #endif diff --git a/malloc/malloc.c b/malloc/malloc.c index 70e4e58845..0c0182ec0b 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -4822,9 +4822,8 @@ _int_free(mstate av, mchunkptr p) goto errout; } p->fd = fd = old; - atomic_full_barrier (); } - while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd); + while ((old = catomic_compare_and_exchange_val_rel (fb, p, fd)) != fd); #else /* Another simple check: make sure the top of the bin is not the record we are going to add (i.e., double free). 
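In source form that guard is tiny; roughly (an editorial sketch of the non-atomic branch, with the error string quoted from memory):

      if (__builtin_expect (old == p, 0))
        {
          errstr = "double free or corruption (fasttop)";
          goto errout;
        }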
*/ From bec466d922ee22b94ac0d00415fb605e136efe6e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 16 Jul 2009 09:54:34 -0700 Subject: [PATCH 26/50] Fix race in corruption check. With atomic fastbins the checks performed can race with concurrent modifications of the arena. If we detect a problem re-do the test after getting the lock. --- ChangeLog | 6 ++++++ malloc/malloc.c | 25 +++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1e9df428c7..6ddf9a1908 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2009-07-16 Ulrich Drepper + Jakub Jelinek + + * malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Make check for + corruption thread-safe. + 2009-07-13 Jakub Jelinek * include/atomic.h (catomic_compare_and_exchange_val_rel): If arch diff --git a/malloc/malloc.c b/malloc/malloc.c index 0c0182ec0b..a459a2b89d 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -4799,8 +4799,29 @@ _int_free(mstate av, mchunkptr p) || __builtin_expect (chunksize (chunk_at_offset (p, size)) >= av->system_mem, 0)) { - errstr = "free(): invalid next size (fast)"; - goto errout; +#ifdef ATOMIC_FASTBINS + /* We might not have a lock at this point and concurrent modifications + of system_mem might have let to a false positive. Redo the test + after getting the lock. */ + if (have_lock + || ({ assert (locked == 0); + mutex_lock(&av->mutex); + locked = 1; + chunk_at_offset (p, size)->size <= 2 * SIZE_SZ + || chunksize (chunk_at_offset (p, size)) >= av->system_mem; + })) +#endif + { + errstr = "free(): invalid next size (fast)"; + goto errout; + } +#ifdef ATOMIC_FASTBINS + if (! have_lock) + { + (void)mutex_unlock(&av->mutex); + locked = 0; + } +#endif } if (__builtin_expect (perturb_byte, 0)) From 50158f95525ca59459a90f2a7bc65ceb892a0807 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Thu, 16 Jul 2009 09:57:32 -0700 Subject: [PATCH 27/50] Use correct release semantic in list update. nscd uses lockfree lists and we need to ensure the correct release semantics is used when adding to the list. --- ChangeLog | 6 ++++++ nscd/cache.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 6ddf9a1908..a81c5b46fb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,10 @@ 2009-07-16 Ulrich Drepper + + * nscd/cache.c (cache_add): Use atomic_compare_and_exchange_bool_rel + instead of atomic_compare_and_exchange_bool_acq to ensure pointer + is written before the list head update. + Patch by Andreas Schwab . + Jakub Jelinek * malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Make check for diff --git a/nscd/cache.c b/nscd/cache.c index ab842efc29..3e6793df2f 100644 --- a/nscd/cache.c +++ b/nscd/cache.c @@ -179,7 +179,7 @@ cache_add (int type, const void *key, size_t len, struct datahead *packet, /* Put the new entry in the first position. */ do newp->next = table->head->array[hash]; - while (atomic_compare_and_exchange_bool_acq (&table->head->array[hash], + while (atomic_compare_and_exchange_bool_rel (&table->head->array[hash], (ref_t) ((char *) newp - table->data), (ref_t) newp->next)); From 137028b4d7e50f71906c1656c27079eac5a1d085 Mon Sep 17 00:00:00 2001 From: Petr Baudis Date: Thu, 16 Jul 2009 10:10:10 -0700 Subject: [PATCH 28/50] Fix lock handling in memory hander of nscd. The commit 20e498bd removes the pthread_mutex_rdlock() calls, but not the corresponding pthread_mutex_unlock() calls. Also, the database lock is never unlocked in one branch of the mempool_alloc() if. 
I think the unreproducible, random assert(dh->usable) crashes in
prune_cache() were caused by this.  But the broken locking also provided
an easy way to make nscd threads hang.
---
 ChangeLog            | 11 +++++++++++
 nscd/aicache.c       |  2 --
 nscd/grpcache.c      |  6 +-----
 nscd/hstcache.c      |  4 ----
 nscd/initgrcache.c   |  4 ----
 nscd/mem.c           |  6 +++---
 nscd/pwdcache.c      |  6 +-----
 nscd/servicescache.c |  4 ----
 8 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a81c5b46fb..37f20a4b8b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2009-07-16  Petr Baudis
+
+	* nscd/mem.c (mempool_alloc): Fix unlock missing in the else branch.
+	* nscd/aicache.c: Remove bogus db->lock unlock.
+	* nscd/grpcache.c: Likewise.
+	* nscd/hstcache.c: Likewise.
+	* nscd/initgrcache.c: Likewise.
+	* nscd/pwdcache.c: Likewise.
+	* nscd/servicescache.c: Likewise.
+
 2009-07-16  Ulrich Drepper

 	* nscd/cache.c (cache_add): Use atomic_compare_and_exchange_bool_rel
@@ -5,6 +15,7 @@
 	is written before the list head update.
 	Patch by Andreas Schwab .

+2009-07-16  Ulrich Drepper
 	Jakub Jelinek

 	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Make check for

diff --git a/nscd/aicache.c b/nscd/aicache.c
index 524c0a63af..8dac48e5c2 100644
--- a/nscd/aicache.c
+++ b/nscd/aicache.c
@@ -543,8 +543,6 @@ addhstaiX (struct database_dyn *db, int fd, request_header *req,
 	  (void) cache_add (req->type, key_copy, req->key_len, &dataset->head,
			    true, db, uid, he == NULL);

-      pthread_rwlock_unlock (&db->lock);
-
       /* Mark the old entry as obsolete.  */
       if (dh != NULL)
	dh->usable = false;
diff --git a/nscd/grpcache.c b/nscd/grpcache.c
index 184d53898c..fc2008449e 100644
--- a/nscd/grpcache.c
+++ b/nscd/grpcache.c
@@ -146,8 +146,6 @@ cache_addgr (struct database_dyn *db, int fd, request_header *req,
 	  (void) cache_add (req->type, &dataset->strdata, req->key_len,
			    &dataset->head, true, db, owner, he == NULL);

-	  pthread_rwlock_unlock (&db->lock);
-
 	  /* Mark the old entry as obsolete.  */
 	  if (dh != NULL)
	    dh->usable = false;
@@ -367,12 +365,10 @@ cache_addgr (struct database_dyn *db, int fd, request_header *req,
 	      (void) cache_add (GETGRBYGID, cp, key_offset, &dataset->head,
				false, db, owner, false);
 	    }
-
-	out:
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }

+out:
   if (__builtin_expect (written != total, 0) && debug_level > 0)
     {
       char buf[256];
diff --git a/nscd/hstcache.c b/nscd/hstcache.c
index 51e2273960..77ffcdf880 100644
--- a/nscd/hstcache.c
+++ b/nscd/hstcache.c
@@ -153,8 +153,6 @@ cache_addhst (struct database_dyn *db, int fd, request_header *req,
 	  (void) cache_add (req->type, &dataset->strdata, req->key_len,
			    &dataset->head, true, db, owner, he == NULL);

-	  pthread_rwlock_unlock (&db->lock);
-
 	  /* Mark the old entry as obsolete.  */
 	  if (dh != NULL)
	    dh->usable = false;
@@ -404,8 +402,6 @@ cache_addhst (struct database_dyn *db, int fd, request_header *req,
 	  (void) cache_add (req->type, key_copy, req->key_len,
			    &dataset->head, true, db, owner, he == NULL);
-
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }

diff --git a/nscd/initgrcache.c b/nscd/initgrcache.c
index c33aaf315f..f8d4742d16 100644
--- a/nscd/initgrcache.c
+++ b/nscd/initgrcache.c
@@ -230,8 +230,6 @@ addinitgroupsX (struct database_dyn *db, int fd, request_header *req,
 	  (void) cache_add (req->type, key_copy, req->key_len, &dataset->head,
			    true, db, uid, he == NULL);

-      pthread_rwlock_unlock (&db->lock);
-
       /* Mark the old entry as obsolete.
*/ if (dh != NULL) dh->usable = false; @@ -388,8 +386,6 @@ addinitgroupsX (struct database_dyn *db, int fd, request_header *req, (void) cache_add (INITGROUPS, cp, req->key_len, &dataset->head, true, db, uid, he == NULL); - - pthread_rwlock_unlock (&db->lock); } } diff --git a/nscd/mem.c b/nscd/mem.c index fcea6dbd03..80ea951146 100644 --- a/nscd/mem.c +++ b/nscd/mem.c @@ -566,9 +566,6 @@ mempool_alloc (struct database_dyn *db, size_t len, int data_alloc) } } - if (data_alloc) - pthread_rwlock_unlock (&db->lock); - if (! db->last_alloc_failed) { dbg_log (_("no more memory for database '%s'"), dbnames[db - dbs]); @@ -591,5 +588,8 @@ mempool_alloc (struct database_dyn *db, size_t len, int data_alloc) pthread_mutex_unlock (&db->memlock); + if (data_alloc) + pthread_rwlock_unlock (&db->lock); + return res; } diff --git a/nscd/pwdcache.c b/nscd/pwdcache.c index 2338e7e1e0..fc5b44eef0 100644 --- a/nscd/pwdcache.c +++ b/nscd/pwdcache.c @@ -153,8 +153,6 @@ cache_addpw (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, owner, he == NULL); - pthread_rwlock_unlock (&db->lock); - /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; @@ -362,12 +360,10 @@ cache_addpw (struct database_dyn *db, int fd, request_header *req, (void) cache_add (GETPWBYUID, cp, key_offset, &dataset->head, false, db, owner, false); } - - out: - pthread_rwlock_unlock (&db->lock); } } +out: if (__builtin_expect (written != total, 0) && debug_level > 0) { char buf[256]; diff --git a/nscd/servicescache.c b/nscd/servicescache.c index dc98d3005a..c965c972a3 100644 --- a/nscd/servicescache.c +++ b/nscd/servicescache.c @@ -136,8 +136,6 @@ cache_addserv (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, &dataset->strdata, req->key_len, &dataset->head, true, db, owner, he == NULL); - pthread_rwlock_unlock (&db->lock); - /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; @@ -317,8 +315,6 @@ cache_addserv (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, owner, he == NULL); - - pthread_rwlock_unlock (&db->lock); } } From 1c0ab5bd34159d2ae53390571113844ebadc161b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 16 Jul 2009 23:37:50 -0700 Subject: [PATCH 29/50] Handle overly large answer buffers in resolver. In EDNS0 records the maximum result size is transmitted in a 16 bit value. Large buffer sizes were handled incorrectly by using only the low 16 bits. Fix this by limiting the size to 0xffff. --- ChangeLog | 6 ++++++ resolv/res_mkquery.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 37f20a4b8b..7dab061790 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2009-07-16 Ulrich Drepper + + [BZ #10360] + * resolv/res-mkquery.c (__res_nopt): If anslen is > 0xffff store + 0xffff in the EDNS0 record. + 2009-07-16 Petr Baudis * nscd/mem.c (mempool_alloc): Fix unlock missing in the else branch. diff --git a/resolv/res_mkquery.c b/resolv/res_mkquery.c index ae0cdb417e..2dda4c0f45 100644 --- a/resolv/res_mkquery.c +++ b/resolv/res_mkquery.c @@ -244,7 +244,7 @@ __res_nopt(res_state statp, *cp++ = 0; /* "." 
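(an editorial aside on the change just below: with a 16-bit payload-size field, masking wraps while clamping saturates; for a hypothetical 128 KiB answer buffer, anslen == 0x20000,

      anslen & 0xffff      == 0x0000   which advertises a zero-byte payload,
      MIN (anslen, 0xffff) == 0xffff   which advertises the EDNS0 maximum,

hence the fix replaces the mask with the clamp)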
*/ NS_PUT16(T_OPT, cp); /* TYPE */ - NS_PUT16(anslen & 0xffff, cp); /* CLASS = UDP payload size */ + NS_PUT16(MIN(anslen, 0xffff), cp); /* CLASS = UDP payload size */ *cp++ = NOERROR; /* extended RCODE */ *cp++ = 0; /* EDNS version */ /* XXX Once we support DNSSEC we change the flag value here. */ From 09f97a8fbf8be28cc489a7baa1bead17e2cbf764 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 05:58:12 -0700 Subject: [PATCH 30/50] Add missing BZ number in ChangeLog. --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 7dab061790..6418f866f4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -6,6 +6,7 @@ 2009-07-16 Petr Baudis + [BZ #10402] * nscd/mem.c (mempool_alloc): Fix unlock missing in the else branch. * nscd/aicache.c: Remove bogus db->lock unlock. * nscd/grpcache.c: Likewise. From 00ebd7ed58df389a78e41dece058048725cb585e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 07:49:16 -0700 Subject: [PATCH 31/50] Revert "Fix lock handling in memory hander of nscd." This reverts commit 137028b4d7e50f71906c1656c27079eac5a1d085. Conflicts: ChangeLog --- ChangeLog | 11 ----------- nscd/aicache.c | 2 ++ nscd/grpcache.c | 6 +++++- nscd/hstcache.c | 4 ++++ nscd/initgrcache.c | 4 ++++ nscd/mem.c | 6 +++--- nscd/pwdcache.c | 6 +++++- nscd/servicescache.c | 4 ++++ 8 files changed, 27 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6418f866f4..dce8cbc232 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,17 +4,6 @@ * resolv/res-mkquery.c (__res_nopt): If anslen is > 0xffff store 0xffff in the EDNS0 record. -2009-07-16 Petr Baudis - - [BZ #10402] - * nscd/mem.c (mempool_alloc): Fix unlock missing in the else branch. - * nscd/aicache.c: Remove bogus db->lock unlock. - * nscd/grpcache.c: Likewise. - * nscd/hstcache.c: Likewise. - * nscd/initgrcache.c: Likewise. - * nscd/pwdcache.c: Likewise. - * nscd/servicescache.c: Likewise. - 2009-07-16 Ulrich Drepper * nscd/cache.c (cache_add): Use atomic_compare_and_exchange_bool_rel diff --git a/nscd/aicache.c b/nscd/aicache.c index 8dac48e5c2..524c0a63af 100644 --- a/nscd/aicache.c +++ b/nscd/aicache.c @@ -543,6 +543,8 @@ addhstaiX (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, uid, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; diff --git a/nscd/grpcache.c b/nscd/grpcache.c index fc2008449e..184d53898c 100644 --- a/nscd/grpcache.c +++ b/nscd/grpcache.c @@ -146,6 +146,8 @@ cache_addgr (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, &dataset->strdata, req->key_len, &dataset->head, true, db, owner, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. 
*/ if (dh != NULL) dh->usable = false; @@ -365,10 +367,12 @@ cache_addgr (struct database_dyn *db, int fd, request_header *req, (void) cache_add (GETGRBYGID, cp, key_offset, &dataset->head, false, db, owner, false); } + + out: + pthread_rwlock_unlock (&db->lock); } } -out: if (__builtin_expect (written != total, 0) && debug_level > 0) { char buf[256]; diff --git a/nscd/hstcache.c b/nscd/hstcache.c index 77ffcdf880..51e2273960 100644 --- a/nscd/hstcache.c +++ b/nscd/hstcache.c @@ -153,6 +153,8 @@ cache_addhst (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, &dataset->strdata, req->key_len, &dataset->head, true, db, owner, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; @@ -402,6 +404,8 @@ cache_addhst (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, owner, he == NULL); + + pthread_rwlock_unlock (&db->lock); } } diff --git a/nscd/initgrcache.c b/nscd/initgrcache.c index f8d4742d16..c33aaf315f 100644 --- a/nscd/initgrcache.c +++ b/nscd/initgrcache.c @@ -230,6 +230,8 @@ addinitgroupsX (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, uid, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; @@ -386,6 +388,8 @@ addinitgroupsX (struct database_dyn *db, int fd, request_header *req, (void) cache_add (INITGROUPS, cp, req->key_len, &dataset->head, true, db, uid, he == NULL); + + pthread_rwlock_unlock (&db->lock); } } diff --git a/nscd/mem.c b/nscd/mem.c index 80ea951146..fcea6dbd03 100644 --- a/nscd/mem.c +++ b/nscd/mem.c @@ -566,6 +566,9 @@ mempool_alloc (struct database_dyn *db, size_t len, int data_alloc) } } + if (data_alloc) + pthread_rwlock_unlock (&db->lock); + if (! db->last_alloc_failed) { dbg_log (_("no more memory for database '%s'"), dbnames[db - dbs]); @@ -588,8 +591,5 @@ mempool_alloc (struct database_dyn *db, size_t len, int data_alloc) pthread_mutex_unlock (&db->memlock); - if (data_alloc) - pthread_rwlock_unlock (&db->lock); - return res; } diff --git a/nscd/pwdcache.c b/nscd/pwdcache.c index fc5b44eef0..2338e7e1e0 100644 --- a/nscd/pwdcache.c +++ b/nscd/pwdcache.c @@ -153,6 +153,8 @@ cache_addpw (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, owner, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. */ if (dh != NULL) dh->usable = false; @@ -360,10 +362,12 @@ cache_addpw (struct database_dyn *db, int fd, request_header *req, (void) cache_add (GETPWBYUID, cp, key_offset, &dataset->head, false, db, owner, false); } + + out: + pthread_rwlock_unlock (&db->lock); } } -out: if (__builtin_expect (written != total, 0) && debug_level > 0) { char buf[256]; diff --git a/nscd/servicescache.c b/nscd/servicescache.c index c965c972a3..dc98d3005a 100644 --- a/nscd/servicescache.c +++ b/nscd/servicescache.c @@ -136,6 +136,8 @@ cache_addserv (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, &dataset->strdata, req->key_len, &dataset->head, true, db, owner, he == NULL); + pthread_rwlock_unlock (&db->lock); + /* Mark the old entry as obsolete. 
*/ if (dh != NULL) dh->usable = false; @@ -315,6 +317,8 @@ cache_addserv (struct database_dyn *db, int fd, request_header *req, (void) cache_add (req->type, key_copy, req->key_len, &dataset->head, true, db, owner, he == NULL); + + pthread_rwlock_unlock (&db->lock); } } From d52c96e73a20d1c1d266f783fc31df6759207ea0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 12:14:09 -0700 Subject: [PATCH 32/50] Replace hand-coded unwind tables from x86-64 pthread_cond_wait. --- nptl/ChangeLog | 6 ++ .../sysv/linux/x86_64/pthread_cond_wait.S | 94 ++++--------------- 2 files changed, 24 insertions(+), 76 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index ec9ace7d31..2551faa0d9 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,9 @@ +2009-07-17 Ulrich Drepper + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__condvar_cleanup): Rewrite to use cfi directives instead of + hand-coded unwind tables. + 2009-06-12 Ulrich Drepper * Makefile (libpthread-routines): Add pthread_sigqueue. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index e5e802d531..146a414d41 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -33,7 +33,10 @@ .globl __condvar_cleanup .hidden __condvar_cleanup __condvar_cleanup: + cfi_startproc pushq %r12 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) /* Get internal lock. */ movq %rdi, %r8 @@ -141,8 +144,11 @@ __condvar_cleanup: callq __pthread_mutex_cond_lock popq %r12 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) retq + cfi_endproc .size __condvar_cleanup, .-__condvar_cleanup @@ -151,12 +157,14 @@ __condvar_cleanup: .type __pthread_cond_wait, @function .align 16 __pthread_cond_wait: -.LSTARTCODE: + cfi_startproc pushq %r12 -.Lpush_r12: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) #define FRAME_SIZE 64 subq $FRAME_SIZE, %rsp -.Lsubq: + cfi_adjust_cfa_offset(FRAME_SIZE) + /* Stack frame: rsp + 64 @@ -332,17 +340,19 @@ __pthread_cond_wait: movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 14: addq $FRAME_SIZE, %rsp -.Laddq: + cfi_adjust_cfa_offset(-FRAME_SIZE) popq %r12 -.Lpop_r12: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) /* We return the result of the mutex_lock operation. */ retq /* Initial locking failed. */ 1: -.LSbl1: + cfi_adjust_cfa_offset(8 + FRAME_SIZE) + cfi_rel_offset(%r12, FRAME_SIZE) #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -414,75 +424,7 @@ __pthread_cond_wait: 13: movq %r10, %rax jmp 14b -.LENDCODE: + cfi_endproc .size __pthread_cond_wait, .-__pthread_cond_wait versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait, GLIBC_2_3_2) - - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long L(ENDCIE)-L(STARTCIE) # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zR" # NUL-terminated augmentation - # string. -#else - .ascii "\0" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 1 # Augmentation value length. - .byte 0x1b # Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. 
-#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x8 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code -#else - .long .LSTARTCODE # Start address of the code. -#endif - .long .LENDCODE-.LSTARTCODE # Length of the code. -#ifdef SHARED - .uleb128 0 # No augmentation data. -#endif - .byte 0x40+.Lpush_r12-.LSTARTCODE # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x40+.Lsubq-.Lpush_r12 # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16+FRAME_SIZE - .byte 3 # DW_CFA_advance_loc2 - .2byte .Laddq-.Lsubq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x40+.Lpop_r12-.Laddq # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 0x40+.LSbl1-.Lpop_r12 # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 80 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .align 8 -.LENDFDE: From 63601ccd1614250bf91e47f46c6b24b53dfd6af6 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 14:45:08 -0700 Subject: [PATCH 33/50] Replace hand-coded unwind tables from x86-64 pthread_once. --- ChangeLog | 5 + nptl/ChangeLog | 2 + .../unix/sysv/linux/x86_64/pthread_once.S | 128 +++--------------- sysdeps/generic/sysdep.h | 43 +++++- 4 files changed, 65 insertions(+), 113 deletions(-) diff --git a/ChangeLog b/ChangeLog index dce8cbc232..da91742d21 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2009-07-17 Ulrich Drepper + + * sysdeps/generic/sysdep.h: Define cfi_personality, cfi_lsda, + CFI_PERSONALITY, CFI_LSDA, and DW_EH_PE_* constants. + 2009-07-16 Ulrich Drepper [BZ #10360] diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 2551faa0d9..084ebe63b6 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -3,6 +3,8 @@ * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (__condvar_cleanup): Rewrite to use cfi directives instead of hand-coded unwind tables. + * sysdeps/unix/sysv/linux/x86_64/pthread_once.S (__pthread_once): + Likewise. 2009-06-12 Ulrich Drepper diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S index c3b2b51bdb..ccc18493a2 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -17,6 +17,7 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#include #include #include #include @@ -32,6 +33,15 @@ .align 16 __pthread_once: .LSTARTCODE: + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif testl $2, (%rdi) jz 1f xorl %eax, %eax @@ -39,7 +49,7 @@ __pthread_once: /* Preserve the function pointer. */ 1: pushq %rsi -.Lpush_rsi: + cfi_adjust_cfa_offset(8) xorq %r10, %r10 /* Not yet initialized or initialization in progress. 
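As an editorial reading aid, the once-control word tested above can be modeled roughly as follows (a sketch with invented names: bit 1 means the init routine has completed, bit 0 means some thread is running it, and higher bits carry a fork generation; the futex wait/wake and cancellation handling of the real code are omitted):

    #include <stdatomic.h>

    static void
    once_sketch (atomic_int *ctrl, void (*init_routine) (void))
    {
      for (;;)
        {
          int val = atomic_load (ctrl);
          if (val & 2)
            return;                       // fast path: already initialized
          if ((val & 1) == 0
              && atomic_compare_exchange_weak (ctrl, &val, val | 1))
            {
              init_routine ();            // this thread won the race
              atomic_fetch_or (ctrl, 2);  // publish completion
              return;                     // real code futex-wakes waiters here
            }
          // another thread is initializing; real code futex-waits here
        }
    }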
@@ -86,9 +96,9 @@ __pthread_once: /* Preserve the pointer to the control variable. */ 3: pushq %rdi -.Lpush_rdi: + cfi_adjust_cfa_offset(8) pushq %rdi -.Lpush_rdi2: + cfi_adjust_cfa_offset(8) .LcleanupSTART: callq *16(%rsp) @@ -96,14 +106,14 @@ __pthread_once: /* Get the control variable address back. */ popq %rdi -.Lpop_rdi: + cfi_adjust_cfa_offset(-8) /* Sucessful run of the initializer. Signal that we are done. */ LOCK incl (%rdi) addq $8, %rsp -.Ladd1: + cfi_adjust_cfa_offset(-8) /* Wake up all other threads. */ movl $0x7fffffff, %edx @@ -117,10 +127,9 @@ __pthread_once: syscall 4: addq $8, %rsp -.Ladd2: + cfi_adjust_cfa_offset(-8) xorl %eax, %eax retq - .size __pthread_once,.-__pthread_once @@ -134,6 +143,7 @@ pthread_once = __pthread_once .type clear_once_control,@function .align 16 clear_once_control: + cfi_adjust_cfa_offset(3 * 8) movq (%rsp), %rdi movq %rax, %r8 movl $0, (%rdi) @@ -153,15 +163,15 @@ clear_once_control: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size clear_once_control,.-clear_once_control .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE @@ -175,100 +185,6 @@ clear_once_control: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. 
- .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rsi-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rdi-.Lpush_rsi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rdi2-.Lpush_rdi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_rdi-.Lpush_rdi2 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 4 # DW_CFA_advance_loc4 - .long .Ladd1-.Lpop_rdi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 4 # DW_CFA_advance_loc4 - .long .Ladd2-.Ladd1 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 4 # DW_CFA_advance_loc4 - .long clear_once_control-.Ladd2 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 -#if 0 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_rdi3-clear_once_control - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 -#endif - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 diff --git a/sysdeps/generic/sysdep.h b/sysdeps/generic/sysdep.h index 15d951c777..54884d9afe 100644 --- a/sysdeps/generic/sysdep.h +++ b/sysdeps/generic/sysdep.h @@ -1,5 +1,5 @@ /* Generic asm macros used on many machines. - Copyright (C) 1991,92,93,96,98,2002,2003 Free Software Foundation, Inc. + Copyright (C) 1991,92,93,96,98,2002,2003,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -39,13 +39,13 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms to generate correct debugging information. */ -#ifndef END -#define END(sym) -#endif +# ifndef END +# define END(sym) +# endif -#ifndef JUMPTARGET -#define JUMPTARGET(sym) sym -#endif +# ifndef JUMPTARGET +# define JUMPTARGET(sym) sym +# endif /* Makros to generate eh_frame unwind information. */ # ifdef HAVE_ASM_CFI_DIRECTIVES @@ -65,6 +65,8 @@ # define cfi_remember_state .cfi_remember_state # define cfi_restore_state .cfi_restore_state # define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp # else # define cfi_startproc # define cfi_endproc @@ -82,6 +84,8 @@ # define cfi_remember_state # define cfi_restore_state # define cfi_window_save +# define cfi_personality(enc, exp) +# define cfi_lsda(enc, exp) # endif #else /* ! ASSEMBLER */ @@ -116,6 +120,10 @@ ".cfi_restore_state" # define CFI_WINDOW_SAVE \ ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # else # define CFI_STARTPROC # define CFI_ENDPROC @@ -132,6 +140,27 @@ # define CFI_REMEMBER_STATE # define CFI_RESTORE_STATE # define CFI_WINDOW_SAVE +# define CFI_PERSONALITY(enc, exp) +# define CFI_LSDA(enc, exp) # endif #endif /* __ASSEMBLER__ */ + +/* Values used for encoding parameter of cfi_personality and cfi_lsda. 
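As a worked example of how these values compose, the magic bytes that the deleted hand-written CIEs spelled out numerically are simply ORed encodings:

      DW_EH_PE_pcrel | DW_EH_PE_sdata4                     = 0x10 | 0x0b = 0x1b
      DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect = 0x1b | 0x80 = 0x9b

matching the 0x1b LSDA/FDE encodings and the 0x9b personality encoding in the tables being removed.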
*/ +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0a +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 From c3c2f3cf56ffcfd200f6c26aa5492049140bbbcb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 15:31:36 -0700 Subject: [PATCH 34/50] Replace hand-coded unwind tables from x86-64 sem_wait. --- nptl/ChangeLog | 1 + .../sysdeps/unix/sysv/linux/x86_64/sem_wait.S | 126 ++++-------------- 2 files changed, 26 insertions(+), 101 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 084ebe63b6..5204ad10ca 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -5,6 +5,7 @@ hand-coded unwind tables. * sysdeps/unix/sysv/linux/x86_64/pthread_once.S (__pthread_once): Likewise. + * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Likewise. 2009-06-12 Ulrich Drepper diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S index 5320a91e19..de6a53b015 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -31,10 +31,21 @@ .align 16 sem_wait: .LSTARTCODE: + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif pushq %r12 -.Lpush_r12: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) pushq %r13 -.Lpush_r13: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) movq %rdi, %r13 #if VALUE == 0 @@ -57,13 +68,17 @@ sem_wait: 7: xorl %eax, %eax 9: popq %r13 -.Lpop_r13: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) popq %r12 -.Lpop_r12: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) retq -.Lafter_retq: + cfi_adjust_cfa_offset(2 * 8) + cfi_rel_offset(%r12, 8) + cfi_rel_offset(%r13, 0) 1: LOCK addq $1, NWAITERS(%r13) @@ -141,15 +156,15 @@ sem_wait_cleanup: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size sem_wait_cleanup,.-sem_wait_cleanup .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE @@ -163,97 +178,6 @@ sem_wait_cleanup: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. 
- .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r12-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r13-.Lpush_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r13-.Lpush_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0xcd # DW_CFA_restore %r13 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r12-.Lpop_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 4 # DW_CFA_advance_loc4 - .long .Lafter_retq-.Lpop_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 From 0adae4681750bea9eb729c8935d4b152de4a6b68 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 15:50:23 -0700 Subject: [PATCH 35/50] Optimize x86-64 sem_wait for uncontested semaphore. --- nptl/ChangeLog | 3 ++ .../sysdeps/unix/sysv/linux/x86_64/sem_wait.S | 51 ++++++++++--------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 5204ad10ca..5db47a138b 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,8 @@ 2009-07-17 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Optimize + handling of uncontested semaphore. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (__condvar_cleanup): Rewrite to use cfi directives instead of hand-coded unwind tables. 
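Before the assembly diff, a sketch of what "uncontested" means here (editorial C, not the glibc source): while the semaphore value is positive a caller can take a token with a bare compare-and-exchange loop and return, without pushing callee-saved registers and without touching the NWAITERS bookkeeping; only a zero value falls through to the futex slow path.

    #include <stdatomic.h>

    static int
    sem_wait_fastpath (atomic_uint *value)
    {
      unsigned int v = atomic_load_explicit (value, memory_order_relaxed);
      while (v != 0)
        if (atomic_compare_exchange_weak (value, &v, v - 1))
          return 0;                /* token taken, no waiter accounting */
      return -1;                   /* contended: caller enters slow path */
    }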
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S index de6a53b015..a01d745a17 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S @@ -40,18 +40,11 @@ sem_wait: cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) #endif - pushq %r12 - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%r12, 0) - pushq %r13 - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%r13, 0) - movq %rdi, %r13 #if VALUE == 0 - movl (%r13), %eax + movl (%rdi), %eax #else - movl VALUE(%r13), %eax + movl VALUE(%rdi), %eax #endif 2: testl %eax, %eax je 1f @@ -59,27 +52,24 @@ sem_wait: leal -1(%rax), %edx LOCK #if VALUE == 0 - cmpxchgl %edx, (%r13) + cmpxchgl %edx, (%rdi) #else - cmpxchgl %edx, VALUE(%r13) + cmpxchgl %edx, VALUE(%rdi) #endif jne 2b -7: xorl %eax, %eax - -9: popq %r13 - cfi_adjust_cfa_offset(-8) - cfi_restore(%r13) - popq %r12 - cfi_adjust_cfa_offset(-8) - cfi_restore(%r12) - + xorl %eax, %eax retq - cfi_adjust_cfa_offset(2 * 8) - cfi_rel_offset(%r12, 8) +1: pushq %r12 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) + pushq %r13 + cfi_adjust_cfa_offset(8) cfi_rel_offset(%r13, 0) -1: LOCK + movq %rdi, %r13 + + LOCK addq $1, NWAITERS(%r13) .LcleanupSTART: @@ -128,8 +118,21 @@ sem_wait: LOCK subq $1, NWAITERS(%r13) - jmp 7b + xorl %eax, %eax + +9: popq %r13 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) + popq %r12 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) + + retq + + cfi_adjust_cfa_offset(2 * 8) + cfi_rel_offset(%r12, 8) + cfi_rel_offset(%r13, 0) 4: negq %r12 #if USE___THREAD movq errno@gottpoff(%rip), %rdx From aee2665a1795c60aa1965387681d801cf3cda75c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 16:26:06 -0700 Subject: [PATCH 36/50] Replace hand-coded unwind tables from x86-64 sem_timedwait. --- nptl/ChangeLog | 2 + .../unix/sysv/linux/x86_64/sem_timedwait.S | 151 ++++-------------- 2 files changed, 31 insertions(+), 122 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 5db47a138b..867493ced5 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -9,6 +9,8 @@ * sysdeps/unix/sysv/linux/x86_64/pthread_once.S (__pthread_once): Likewise. * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Likewise. + * sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait): + Likewise. 2009-06-12 Ulrich Drepper diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S index 88e99cf6a1..7d66d49608 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -34,6 +34,15 @@ .align 16 sem_timedwait: .LSTARTCODE: + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif #if VALUE == 0 movl (%rdi), %eax #else @@ -56,13 +65,16 @@ sem_timedwait: /* Check whether the timeout value is valid. 
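in C terms the condition enforced here is just the POSIX range check on the nanosecond field; roughly (an editorial sketch; the real code reports the error through its usual errno path rather than returning it):

      if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
        return EINVAL;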
*/ 1: pushq %r12 -.Lpush_r12: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) pushq %r13 -.Lpush_r13: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) pushq %r14 -.Lpush_r14: + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r14, 0) subq $24, %rsp -.Lsubq: + cfi_adjust_cfa_offset(24) movq %rdi, %r12 movq %rsi, %r13 @@ -152,16 +164,22 @@ sem_timedwait: subq $1, NWAITERS(%r12) addq $24, %rsp -.Laddq: + cfi_adjust_cfa_offset(-24) popq %r14 -.Lpop_r14: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r14) popq %r13 -.Lpop_r13: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) popq %r12 -.Lpop_r12: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) retq -.Lafter_retq: + cfi_adjust_cfa_offset(3 * 8 + 24) + cfi_rel_offset(%r12, 24 + 2 * 8) + cfi_rel_offset(%r13, 24 + 1 * 8) + cfi_rel_offset(%r14, 24) 3: negq %r14 6: #if USE___THREAD @@ -186,6 +204,7 @@ sem_timedwait_cleanup: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size sem_timedwait_cleanup,.-sem_timedwait_cleanup @@ -208,118 +227,6 @@ sem_timedwait_cleanup: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. 
- .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r12-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r13-.Lpush_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r14-.Lpush_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 0x8e # DW_CFA_offset %r14 - .uleb128 4 - .byte 4 # DW_CFA_advance_loc4 - .long .Lsubq-.Lpush_r14 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 56 - .byte 4 # DW_CFA_advance_loc4 - .long .Laddq-.Lsubq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r14-.Laddq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0xce # DW_CFA_restore %r14 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r13-.Lpop_r14 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0xcd # DW_CFA_restore %r13 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r12-.Lpop_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 4 # DW_CFA_advance_loc4 - .long .Lafter_retq-.Lpop_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 56 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 0x8e # DW_CFA_offset %r14 - .uleb128 4 - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 From 312d667b85fade523b05f73e0620fb5365fb68a7 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 16:39:27 -0700 Subject: [PATCH 37/50] Consistently use symbolic constants. --- nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S index 7d66d49608..c96b37375d 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S @@ -210,10 +210,9 @@ sem_timedwait_cleanup: .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE From f351f2b756e634fde2482a8f9790944db868162b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 17 Jul 2009 19:08:54 -0700 Subject: [PATCH 38/50] Extend x86-64 sem_timedwait to use futex syscall with absolute timeout. --- nptl/ChangeLog | 4 + .../unix/sysv/linux/x86_64/sem_timedwait.S | 162 +++++++++++++----- 2 files changed, 120 insertions(+), 46 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 867493ced5..98d3a4de68 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,9 @@ 2009-07-17 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait): + If possible use FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME to directly + use absolute timeout. + * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Optimize handling of uncontested semaphore. 
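An editorial sketch of the syscall shape the patch switches to (C, using the constants from <linux/futex.h>; the helper name is invented): handing the kernel the absolute deadline directly makes the vsyscall gettimeofday and the absolute-to-relative conversion, kept below only as a fallback, unnecessary.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static long
    futex_wait_abstime (unsigned int *uaddr, unsigned int expected,
                        const struct timespec *abstime, int private_flag)
    {
      return syscall (SYS_futex, uaddr,
                      FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME | private_flag,
                      expected, abstime, NULL, FUTEX_BITSET_MATCH_ANY);
    }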
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S index c96b37375d..95762834d3 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S @@ -18,6 +18,7 @@ 02111-1307 USA. */ #include +#include #include #include #include @@ -73,8 +74,13 @@ sem_timedwait: pushq %r14 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r14, 0) - subq $24, %rsp - cfi_adjust_cfa_offset(24) +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define STACKFRAME 8 +#else +# define STACKFRAME 24 +#endif + subq $STACKFRAME, %rsp + cfi_adjust_cfa_offset(STACKFRAME) movq %rdi, %r12 movq %rsi, %r13 @@ -87,67 +93,50 @@ sem_timedwait: LOCK addq $1, NWAITERS(%r12) -7: xorl %esi, %esi - movq %rsp, %rdi - movq $VSYSCALL_ADDR_vgettimeofday, %rax - callq *%rax - - /* Compute relative timeout. */ - movq 8(%rsp), %rax - movl $1000, %edi - mul %rdi /* Milli seconds to nano seconds. */ - movq (%r13), %rdi - movq 8(%r13), %rsi - subq (%rsp), %rdi - subq %rax, %rsi - jns 5f - addq $1000000000, %rsi - decq %rdi -5: testq %rdi, %rdi - movl $ETIMEDOUT, %r14d - js 6f /* Time is already up. */ - - movq %rdi, (%rsp) /* Store relative timeout. */ - movq %rsi, 8(%rsp) +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif .LcleanupSTART: - call __pthread_enable_asynccancel - movl %eax, 16(%rsp) +13: call __pthread_enable_asynccancel + movl %eax, (%rsp) - movq %rsp, %r10 + movq %r13, %r10 #if VALUE == 0 movq %r12, %rdi #else leaq VALUE(%r12), %rdi #endif -#if FUTEX_WAIT == 0 - movl PRIVATE(%rdi), %esi -#else - movl $FUTEX_WAIT, %esi + movl $0xffffffff, %r9d + movl $FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi orl PRIVATE(%rdi), %esi -#endif movl $SYS_futex, %eax xorl %edx, %edx syscall movq %rax, %r14 - movl 16(%rsp), %edi + movl (%rsp), %edi call __pthread_disable_asynccancel .LcleanupEND: testq %r14, %r14 - je 9f + je 11f cmpq $-EWOULDBLOCK, %r14 jne 3f -9: +11: #if VALUE == 0 movl (%r12), %eax #else movl VALUE(%r12), %eax #endif -8: testl %eax, %eax - je 7b +14: testl %eax, %eax + je 13b leaq -1(%rax), %rcx LOCK @@ -156,15 +145,15 @@ sem_timedwait: #else cmpxchgl %ecx, VALUE(%r12) #endif - jne 8b + jne 14b - xorl %eax, %eax +10: xorl %eax, %eax -10: LOCK +15: LOCK subq $1, NWAITERS(%r12) - addq $24, %rsp - cfi_adjust_cfa_offset(-24) + addq $STACKFRAME, %rsp + cfi_adjust_cfa_offset(-STACKFRAME) popq %r14 cfi_adjust_cfa_offset(-8) cfi_restore(%r14) @@ -176,10 +165,10 @@ sem_timedwait: cfi_restore(%r12) retq - cfi_adjust_cfa_offset(3 * 8 + 24) - cfi_rel_offset(%r12, 24 + 2 * 8) - cfi_rel_offset(%r13, 24 + 1 * 8) - cfi_rel_offset(%r14, 24) + cfi_adjust_cfa_offset(STACKFRAME + 3 * 8) + cfi_rel_offset(%r12, STACKFRAME + 2 * 8) + cfi_rel_offset(%r13, STACKFRAME + 1 * 8) + cfi_rel_offset(%r14, STACKFRAME) 3: negq %r14 6: #if USE___THREAD @@ -191,7 +180,82 @@ sem_timedwait: #endif orl $-1, %eax + jmp 15b + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: +7: xorl %esi, %esi + movq %rsp, %rdi + movq $VSYSCALL_ADDR_vgettimeofday, %rax + callq *%rax + + /* Compute relative timeout. */ + movq 8(%rsp), %rax + movl $1000, %edi + mul %rdi /* Milli seconds to nano seconds. */ + movq (%r13), %rdi + movq 8(%r13), %rsi + subq (%rsp), %rdi + subq %rax, %rsi + jns 5f + addq $1000000000, %rsi + decq %rdi +5: testq %rdi, %rdi + movl $ETIMEDOUT, %r14d + js 6b /* Time is already up. */ + + movq %rdi, (%rsp) /* Store relative timeout. 
*/ + movq %rsi, 8(%rsp) + +.LcleanupSTART2: + call __pthread_enable_asynccancel + movl %eax, 16(%rsp) + + movq %rsp, %r10 +# if VALUE == 0 + movq %r12, %rdi +# else + leaq VALUE(%r12), %rdi +# endif +# if FUTEX_WAIT == 0 + movl PRIVATE(%rdi), %esi +# else + movl $FUTEX_WAIT, %esi + orl PRIVATE(%rdi), %esi +# endif + movl $SYS_futex, %eax + xorl %edx, %edx + syscall + movq %rax, %r14 + + movl 16(%rsp), %edi + call __pthread_disable_asynccancel +.LcleanupEND2: + + testq %r14, %r14 + je 9f + cmpq $-EWOULDBLOCK, %r14 + jne 3b + +9: +# if VALUE == 0 + movl (%r12), %eax +# else + movl VALUE(%r12), %eax +# endif +8: testl %eax, %eax + je 7b + + leaq -1(%rax), %rcx + LOCK +# if VALUE == 0 + cmpxchgl %ecx, (%r12) +# else + cmpxchgl %ecx, VALUE(%r12) +# endif + jne 8b jmp 10b +#endif .size sem_timedwait,.-sem_timedwait @@ -219,6 +283,12 @@ sem_timedwait_cleanup: .uleb128 .LcleanupEND-.LcleanupSTART .uleb128 sem_timedwait_cleanup-.LSTARTCODE .uleb128 0 +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .uleb128 .LcleanupSTART2-.LSTARTCODE + .uleb128 .LcleanupEND2-.LcleanupSTART2 + .uleb128 sem_timedwait_cleanup-.LSTARTCODE + .uleb128 0 +#endif .uleb128 .LcallUR-.LSTARTCODE .uleb128 .LENDCODE-.LcallUR .uleb128 0 From 30b1954abb02aa7ba8136fe728820cd769052efb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 08:09:39 -0700 Subject: [PATCH 39/50] Optimize x86-64 pthread_cond_wait. Instead of actively registering an unwind buffer we now use the exception handling functionality of the gcc runtime. --- nptl/ChangeLog | 6 + .../linux/x86_64/pthread_cond_timedwait.S | 125 +++++++ .../sysv/linux/x86_64/pthread_cond_wait.S | 328 ++++++++++-------- 3 files changed, 315 insertions(+), 144 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 98d3a4de68..33fc8a15e7 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,9 @@ +2009-07-18 Ulrich Drepper + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__pthread_cond_wait): Convert to using exception handler instead of + registered unwind buffer. + 2009-07-17 Ulrich Drepper * sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait): diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index ddcf106a6d..2b535917cc 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -31,6 +31,131 @@ .text + + .align 16 + .type __condvar_cleanup, @function + .globl __condvar_cleanup + .hidden __condvar_cleanup +__condvar_cleanup: + cfi_startproc + pushq %r12 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) + + /* Get internal lock. */ + movq %rdi, %r8 + movq 8(%rdi), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +#if cond_lock == 0 + cmpxchgl %esi, (%rdi) +#else + cmpxchgl %esi, cond_lock(%rdi) +#endif + jz 1f + +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +#if cond_lock != 0 + subq $cond_lock, %rdi +#endif + +1: movl broadcast_seq(%rdi), %edx + cmpl 4(%r8), %edx + jne 3f + + /* We increment the wakeup_seq counter only if it is lower than + total_seq. If this is not the case the thread was woken and + then canceled. In this case we ignore the signal. 
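in pseudo-C (an editorial gloss, using the field names visible in the assembly) the bookkeeping performed next is:

      if (cv->wakeup_seq < cv->total_seq)
        {
          ++cv->wakeup_seq;        // consume the wakeup on our own behalf
          ++cv->cond_futex;
        }
      ++cv->woken_seq;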
*/ + movq total_seq(%rdi), %rax + cmpq wakeup_seq(%rdi), %rax + jbe 6f + incq wakeup_seq(%rdi) + incl cond_futex(%rdi) +6: incq woken_seq(%rdi) + +3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Wake up a thread which wants to destroy the condvar object. */ + xorq %r12, %r12 + cmpq $0xffffffffffffffff, total_seq(%rdi) + jne 4f + movl cond_nwaiters(%rdi), %eax + andl $~((1 << nwaiters_shift) - 1), %eax + jne 4f + + addq $cond_nwaiters, %rdi + cmpq $-1, dep_mutex-cond_nwaiters(%rdi) + movl $1, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + subq $cond_nwaiters, %rdi + movl $1, %r12d + +4: LOCK +#if cond_lock == 0 + decl (%rdi) +#else + decl cond_lock(%rdi) +#endif + je 2f +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + + /* Wake up all waiters to make sure no signal gets lost. */ +2: testq %r12, %r12 + jnz 5f + addq $cond_futex, %rdi + cmpq $-1, dep_mutex-cond_futex(%rdi) + movl $0x7fffffff, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + +5: movq 16(%r8), %rdi + callq __pthread_mutex_cond_lock + + popq %r12 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) + + retq + cfi_endproc + .size __condvar_cleanup, .-__condvar_cleanup + + /* int pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) */ .globl __pthread_cond_timedwait diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index 146a414d41..c3c879cde9 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -28,148 +28,32 @@ .text - .align 16 - .type __condvar_cleanup, @function - .globl __condvar_cleanup - .hidden __condvar_cleanup -__condvar_cleanup: - cfi_startproc - pushq %r12 - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%r12, 0) - - /* Get internal lock. */ - movq %rdi, %r8 - movq 8(%rdi), %rdi - movl $1, %esi - xorl %eax, %eax - LOCK -#if cond_lock == 0 - cmpxchgl %esi, (%rdi) -#else - cmpxchgl %esi, cond_lock(%rdi) -#endif - jz 1f - -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_lock_wait -#if cond_lock != 0 - subq $cond_lock, %rdi -#endif - -1: movl broadcast_seq(%rdi), %edx - cmpl 4(%r8), %edx - jne 3f - - /* We increment the wakeup_seq counter only if it is lower than - total_seq. If this is not the case the thread was woken and - then canceled. In this case we ignore the signal. */ - movq total_seq(%rdi), %rax - cmpq wakeup_seq(%rdi), %rax - jbe 6f - incq wakeup_seq(%rdi) - incl cond_futex(%rdi) -6: incq woken_seq(%rdi) - -3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - - /* Wake up a thread which wants to destroy the condvar object. 
*/ - xorq %r12, %r12 - cmpq $0xffffffffffffffff, total_seq(%rdi) - jne 4f - movl cond_nwaiters(%rdi), %eax - andl $~((1 << nwaiters_shift) - 1), %eax - jne 4f - - addq $cond_nwaiters, %rdi - cmpq $-1, dep_mutex-cond_nwaiters(%rdi) - movl $1, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - subq $cond_nwaiters, %rdi - movl $1, %r12d - -4: LOCK -#if cond_lock == 0 - decl (%rdi) -#else - decl cond_lock(%rdi) -#endif - je 2f -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_unlock_wake - - /* Wake up all waiters to make sure no signal gets lost. */ -2: testq %r12, %r12 - jnz 5f - addq $cond_futex, %rdi - cmpq $-1, dep_mutex-cond_futex(%rdi) - movl $0x7fffffff, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - -5: movq 16(%r8), %rdi - callq __pthread_mutex_cond_lock - - popq %r12 - cfi_adjust_cfa_offset(-8) - cfi_restore(%r12) - - retq - cfi_endproc - .size __condvar_cleanup, .-__condvar_cleanup - - /* int pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) */ .globl __pthread_cond_wait .type __pthread_cond_wait, @function .align 16 __pthread_cond_wait: +.LSTARTCODE: cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif + pushq %r12 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r12, 0) -#define FRAME_SIZE 64 +#define FRAME_SIZE 32 subq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(FRAME_SIZE) /* Stack frame: - rsp + 64 - +--------------------------+ - rsp + 32 | cleanup buffer | + rsp + 32 +--------------------------+ rsp + 24 | old wake_seq value | +--------------------------+ @@ -216,16 +100,6 @@ __pthread_cond_wait: incl cond_futex(%rdi) addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - /* Install cancellation handler. */ -#ifdef PIC - leaq __condvar_cleanup(%rip), %rsi -#else - leaq __condvar_cleanup, %rsi -#endif - leaq 32(%rsp), %rdi - movq %rsp, %rdx - callq __pthread_cleanup_push - /* Get and store current wakeup_seq value. */ movq 8(%rsp), %rdi movq wakeup_seq(%rdi), %r9 @@ -243,6 +117,7 @@ __pthread_cond_wait: #endif jne 3f +.LcleanupSTART: 4: callq __pthread_enable_asynccancel movl %eax, (%rsp) @@ -268,6 +143,7 @@ __pthread_cond_wait: movl (%rsp), %edi callq __pthread_disable_asynccancel +.LcleanupEND: /* Lock. */ movq 8(%rsp), %rdi @@ -333,11 +209,7 @@ __pthread_cond_wait: #endif jne 10f - /* Remove cancellation handler. 
*/ -11: movq 32+CLEANUP_PREV(%rsp), %rdx - movq %rdx, %fs:CLEANUP - - movq 16(%rsp), %rdi +11: movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 14: addq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(-FRAME_SIZE) @@ -424,7 +296,175 @@ __pthread_cond_wait: 13: movq %r10, %rax jmp 14b - cfi_endproc .size __pthread_cond_wait, .-__pthread_cond_wait versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait, GLIBC_2_3_2) + + + .align 16 + .type __condvar_cleanup1, @function + .globl __condvar_cleanup1 + .hidden __condvar_cleanup1 +__condvar_cleanup1: + /* Stack frame: + + rsp + 40 + +--------------------------+ + rsp + 32 | %r12 | + +--------------------------+ + rsp + 24 | unused | + +--------------------------+ + rsp + 16 | mutex pointer | + +--------------------------+ + rsp + 8 | condvar pointer | + +--------------------------+ + rsp + 4 | old broadcast_seq value | + +--------------------------+ + rsp + 0 | old cancellation mode | + +--------------------------+ + */ + + movq %rax, 24(%rsp) + + /* Get internal lock. */ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +#if cond_lock == 0 + cmpxchgl %esi, (%rdi) +#else + cmpxchgl %esi, cond_lock(%rdi) +#endif + jz 1f + +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +#if cond_lock != 0 + subq $cond_lock, %rdi +#endif + +1: movl broadcast_seq(%rdi), %edx + cmpl 4(%rsp), %edx + jne 3f + + /* We increment the wakeup_seq counter only if it is lower than + total_seq. If this is not the case the thread was woken and + then canceled. In this case we ignore the signal. */ + movq total_seq(%rdi), %rax + cmpq wakeup_seq(%rdi), %rax + jbe 6f + incq wakeup_seq(%rdi) + incl cond_futex(%rdi) +6: incq woken_seq(%rdi) + +3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Wake up a thread which wants to destroy the condvar object. */ + xorq %r12, %r12 + cmpq $0xffffffffffffffff, total_seq(%rdi) + jne 4f + movl cond_nwaiters(%rdi), %eax + andl $~((1 << nwaiters_shift) - 1), %eax + jne 4f + + cmpq $-1, dep_mutex(%rdi) + leaq cond_nwaiters(%rdi), %rdi + movl $1, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + subq $cond_nwaiters, %rdi + movl $1, %r12d + +4: LOCK +#if cond_lock == 0 + decl (%rdi) +#else + decl cond_lock(%rdi) +#endif + je 2f +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + + /* Wake up all waiters to make sure no signal gets lost. 
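+	   (A canceled waiter may have been the target of a
+	   pthread_cond_signal wakeup that it will never consume;
+	   waking everyone, at the cost of spurious wakeups, keeps
+	   that signal from being lost.)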
*/ +2: testq %r12, %r12 + jnz 5f + addq $cond_futex, %rdi + cmpq $-1, dep_mutex-cond_futex(%rdi) + movl $0x7fffffff, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + +5: movq 16(%rsp), %rdi + callq __pthread_mutex_cond_lock + + movq 24(%rsp), %rdi + movq 32(%rsp), %r12 +.LcallUR: + call _Unwind_Resume@PLT + hlt +.LENDCODE: + cfi_endproc + .size __condvar_cleanup1, .-__condvar_cleanup1 + + + .section .gcc_except_table,"a",@progbits +.LexceptSTART: + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format + .uleb128 .Lcstend-.Lcstbegin +.Lcstbegin: + .uleb128 .LcleanupSTART-.LSTARTCODE + .uleb128 .LcleanupEND-.LcleanupSTART + .uleb128 __condvar_cleanup1-.LSTARTCODE + .uleb128 0 + .uleb128 .LcallUR-.LSTARTCODE + .uleb128 .LENDCODE-.LcallUR + .uleb128 0 + .uleb128 0 +.Lcstend: + + +#ifdef SHARED + .hidden DW.ref.__gcc_personality_v0 + .weak DW.ref.__gcc_personality_v0 + .section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits + .align 8 + .type DW.ref.__gcc_personality_v0, @object + .size DW.ref.__gcc_personality_v0, 8 +DW.ref.__gcc_personality_v0: + .quad __gcc_personality_v0 +#endif From 92618c954ff3c729c490f93bd15e621930656a47 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 08:53:18 -0700 Subject: [PATCH 40/50] Optimize x86-64 pthread_cond_timedwait. Instead of actively registering an unwind buffer we now use the exception handling functionality of the gcc runtime. --- nptl/ChangeLog | 2 + .../linux/x86_64/pthread_cond_timedwait.S | 332 ++++++++++-------- 2 files changed, 190 insertions(+), 144 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 33fc8a15e7..1ee3b19078 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -3,6 +3,8 @@ * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (__pthread_cond_wait): Convert to using exception handler instead of registered unwind buffer. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S + (__pthread_cond_timedwait): Likewise. 2009-07-17 Ulrich Drepper diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 2b535917cc..a2ebfec9c8 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -32,137 +32,23 @@ .text - .align 16 - .type __condvar_cleanup, @function - .globl __condvar_cleanup - .hidden __condvar_cleanup -__condvar_cleanup: - cfi_startproc - pushq %r12 - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%r12, 0) - - /* Get internal lock. */ - movq %rdi, %r8 - movq 8(%rdi), %rdi - movl $1, %esi - xorl %eax, %eax - LOCK -#if cond_lock == 0 - cmpxchgl %esi, (%rdi) -#else - cmpxchgl %esi, cond_lock(%rdi) -#endif - jz 1f - -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_lock_wait -#if cond_lock != 0 - subq $cond_lock, %rdi -#endif - -1: movl broadcast_seq(%rdi), %edx - cmpl 4(%r8), %edx - jne 3f - - /* We increment the wakeup_seq counter only if it is lower than - total_seq. If this is not the case the thread was woken and - then canceled. In this case we ignore the signal. 
*/ - movq total_seq(%rdi), %rax - cmpq wakeup_seq(%rdi), %rax - jbe 6f - incq wakeup_seq(%rdi) - incl cond_futex(%rdi) -6: incq woken_seq(%rdi) - -3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - - /* Wake up a thread which wants to destroy the condvar object. */ - xorq %r12, %r12 - cmpq $0xffffffffffffffff, total_seq(%rdi) - jne 4f - movl cond_nwaiters(%rdi), %eax - andl $~((1 << nwaiters_shift) - 1), %eax - jne 4f - - addq $cond_nwaiters, %rdi - cmpq $-1, dep_mutex-cond_nwaiters(%rdi) - movl $1, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - subq $cond_nwaiters, %rdi - movl $1, %r12d - -4: LOCK -#if cond_lock == 0 - decl (%rdi) -#else - decl cond_lock(%rdi) -#endif - je 2f -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_unlock_wake - - /* Wake up all waiters to make sure no signal gets lost. */ -2: testq %r12, %r12 - jnz 5f - addq $cond_futex, %rdi - cmpq $-1, dep_mutex-cond_futex(%rdi) - movl $0x7fffffff, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - -5: movq 16(%r8), %rdi - callq __pthread_mutex_cond_lock - - popq %r12 - cfi_adjust_cfa_offset(-8) - cfi_restore(%r12) - - retq - cfi_endproc - .size __condvar_cleanup, .-__condvar_cleanup - - /* int pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) */ .globl __pthread_cond_timedwait .type __pthread_cond_timedwait, @function .align 16 __pthread_cond_timedwait: +.LSTARTCODE: cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif + pushq %r12 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r12, 0) @@ -172,7 +58,7 @@ __pthread_cond_timedwait: pushq %r14 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r14, 0) -#define FRAME_SIZE 80 +#define FRAME_SIZE 48 subq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(FRAME_SIZE) @@ -182,9 +68,7 @@ __pthread_cond_timedwait: /* Stack frame: - rsp + 80 - +--------------------------+ - rsp + 48 | cleanup buffer | + rsp + 48 +--------------------------+ rsp + 40 | old wake_seq value | +--------------------------+ @@ -234,16 +118,6 @@ __pthread_cond_timedwait: incl cond_futex(%rdi) addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - /* Install cancellation handler. */ -#ifdef PIC - leaq __condvar_cleanup(%rip), %rsi -#else - leaq __condvar_cleanup, %rsi -#endif - leaq 48(%rsp), %rdi - movq %rsp, %rdx - callq __pthread_cleanup_push - /* Get and store current wakeup_seq value. */ movq 8(%rsp), %rdi movq wakeup_seq(%rdi), %r9 @@ -321,6 +195,7 @@ __pthread_cond_timedwait: #endif jne 3f +.LcleanupSTART: 4: callq __pthread_enable_asynccancel movl %eax, (%rsp) @@ -346,6 +221,7 @@ __pthread_cond_timedwait: movl (%rsp), %edi callq __pthread_disable_asynccancel +.LcleanupEND: /* Lock. 
*/ movq 8(%rsp), %rdi @@ -422,11 +298,7 @@ __pthread_cond_timedwait: #endif jne 10f - /* Remove cancellation handler. */ -11: movq 48+CLEANUP_PREV(%rsp), %rdx - movq %rdx, %fs:CLEANUP - - movq 16(%rsp), %rdi +11: movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock testq %rax, %rax @@ -548,7 +420,179 @@ __pthread_cond_timedwait: js 6b jmp 21b #endif - cfi_endproc .size __pthread_cond_timedwait, .-__pthread_cond_timedwait versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait, GLIBC_2_3_2) + + + .align 16 + .type __condvar_cleanup2, @function +__condvar_cleanup2: + /* Stack frame: + + rsp + 72 + +--------------------------+ + rsp + 64 | %r12 | + +--------------------------+ + rsp + 56 | %r13 | + +--------------------------+ + rsp + 48 | %r14 | + +--------------------------+ + rsp + 24 | unused | + +--------------------------+ + rsp + 16 | mutex pointer | + +--------------------------+ + rsp + 8 | condvar pointer | + +--------------------------+ + rsp + 4 | old broadcast_seq value | + +--------------------------+ + rsp + 0 | old cancellation mode | + +--------------------------+ + */ + + movq %rax, 24(%rsp) + + /* Get internal lock. */ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +#if cond_lock == 0 + cmpxchgl %esi, (%rdi) +#else + cmpxchgl %esi, cond_lock(%rdi) +#endif + jz 1f + +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +#if cond_lock != 0 + subq $cond_lock, %rdi +#endif + +1: movl broadcast_seq(%rdi), %edx + cmpl 4(%rsp), %edx + jne 3f + + /* We increment the wakeup_seq counter only if it is lower than + total_seq. If this is not the case the thread was woken and + then canceled. In this case we ignore the signal. */ + movq total_seq(%rdi), %rax + cmpq wakeup_seq(%rdi), %rax + jbe 6f + incq wakeup_seq(%rdi) + incl cond_futex(%rdi) +6: incq woken_seq(%rdi) + +3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Wake up a thread which wants to destroy the condvar object. */ + xorq %r12, %r12 + cmpq $0xffffffffffffffff, total_seq(%rdi) + jne 4f + movl cond_nwaiters(%rdi), %eax + andl $~((1 << nwaiters_shift) - 1), %eax + jne 4f + + cmpq $-1, dep_mutex(%rdi) + leaq cond_nwaiters(%rdi), %rdi + movl $1, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + subq $cond_nwaiters, %rdi + movl $1, %r12d + +4: LOCK +#if cond_lock == 0 + decl (%rdi) +#else + decl cond_lock(%rdi) +#endif + je 2f +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + + /* Wake up all waiters to make sure no signal gets lost. 
*/ +2: testq %r12, %r12 + jnz 5f + addq $cond_futex, %rdi + cmpq $-1, dep_mutex-cond_futex(%rdi) + movl $0x7fffffff, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + +5: movq 16(%rsp), %rdi + callq __pthread_mutex_cond_lock + + movq 24(%rsp), %rdi + movq FRAME_SIZE(%rsp), %r14 + movq FRAME_SIZE+8(%rsp), %r13 + movq FRAME_SIZE+16(%rsp), %r12 +.LcallUR: + call _Unwind_Resume@PLT + hlt +.LENDCODE: + cfi_endproc + .size __condvar_cleanup2, .-__condvar_cleanup2 + + + .section .gcc_except_table,"a",@progbits +.LexceptSTART: + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format + .uleb128 .Lcstend-.Lcstbegin +.Lcstbegin: + .uleb128 .LcleanupSTART-.LSTARTCODE + .uleb128 .LcleanupEND-.LcleanupSTART + .uleb128 __condvar_cleanup2-.LSTARTCODE + .uleb128 0 + .uleb128 .LcallUR-.LSTARTCODE + .uleb128 .LENDCODE-.LcallUR + .uleb128 0 + .uleb128 0 +.Lcstend: + + +#ifdef SHARED + .hidden DW.ref.__gcc_personality_v0 + .weak DW.ref.__gcc_personality_v0 + .section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits + .align 8 + .type DW.ref.__gcc_personality_v0, @object + .size DW.ref.__gcc_personality_v0, 8 +DW.ref.__gcc_personality_v0: + .quad __gcc_personality_v0 +#endif From e88726b483a275824e852f64476087568dbae7bb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 12:44:12 -0700 Subject: [PATCH 41/50] Extend x86-64 pthread_cond_timedwait to use futex syscall with absolute timeout. --- nptl/ChangeLog | 4 + .../linux/x86_64/pthread_cond_timedwait.S | 418 ++++++++++++------ 2 files changed, 296 insertions(+), 126 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 1ee3b19078..c7e1d048be 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,9 @@ 2009-07-18 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__pthread_cond_timedwait): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (__pthread_cond_wait): Convert to using exception handler instead of registered unwind buffer. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index a2ebfec9c8..21115fddec 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -58,21 +58,25 @@ __pthread_cond_timedwait: pushq %r14 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r14, 0) -#define FRAME_SIZE 48 +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define FRAME_SIZE 32 +#else +# define FRAME_SIZE 48 +#endif subq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(FRAME_SIZE) cmpq $1000000000, 8(%rdx) movl $EINVAL, %eax - jae 18f + jae 48f /* Stack frame: rsp + 48 +--------------------------+ - rsp + 40 | old wake_seq value | + rsp + 32 | timeout value | +--------------------------+ - rsp + 24 | timeout value | + rsp + 24 | old wake_seq value | +--------------------------+ rsp + 16 | mutex pointer | +--------------------------+ @@ -94,8 +98,18 @@ __pthread_cond_timedwait: je 22f movq %rsi, dep_mutex(%rdi) +22: +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif + /* Get internal lock. 
*/ -22: movl $1, %esi + movl $1, %esi xorl %eax, %eax LOCK #if cond_lock == 0 @@ -103,15 +117,15 @@ __pthread_cond_timedwait: #else cmpxchgl %esi, cond_lock(%rdi) #endif - jnz 1f + jnz 31f /* Unlock the mutex. */ -2: movq 16(%rsp), %rdi +32: movq 16(%rsp), %rdi xorl %esi, %esi callq __pthread_mutex_unlock_usercnt testl %eax, %eax - jne 16f + jne 46f movq 8(%rsp), %rdi incq total_seq(%rdi) @@ -122,69 +136,10 @@ __pthread_cond_timedwait: movq 8(%rsp), %rdi movq wakeup_seq(%rdi), %r9 movl broadcast_seq(%rdi), %edx - movq %r9, 40(%rsp) + movq %r9, 24(%rsp) movl %edx, 4(%rsp) - /* Get the current time. */ -8: -#ifdef __NR_clock_gettime - /* Get the clock number. Note that the field in the condvar - structure stores the number minus 1. */ - movq 8(%rsp), %rdi - movl cond_nwaiters(%rdi), %edi - andl $((1 << nwaiters_shift) - 1), %edi - /* Only clocks 0 and 1 are allowed so far. Both are handled in the - kernel. */ - leaq 24(%rsp), %rsi -# ifdef SHARED - movq __vdso_clock_gettime@GOTPCREL(%rip), %rax - movq (%rax), %rax - PTR_DEMANGLE (%rax) - jz 26f - call *%rax - jmp 27f -# endif -26: movl $__NR_clock_gettime, %eax - syscall -27: -# ifndef __ASSUME_POSIX_TIMERS - cmpq $-ENOSYS, %rax - je 19f -# endif - - /* Compute relative timeout. */ - movq (%r13), %rcx - movq 8(%r13), %rdx - subq 24(%rsp), %rcx - subq 32(%rsp), %rdx -#else - leaq 24(%rsp), %rdi - xorl %esi, %esi - movq $VSYSCALL_ADDR_vgettimeofday, %rax - callq *%rax - - /* Compute relative timeout. */ - movq 32(%rsp), %rax - movl $1000, %edx - mul %rdx /* Milli seconds to nano seconds. */ - movq (%r13), %rcx - movq 8(%r13), %rdx - subq 24(%rsp), %rcx - subq %rax, %rdx -#endif - jns 12f - addq $1000000000, %rdx - decq %rcx -12: testq %rcx, %rcx - movq 8(%rsp), %rdi - movq $-ETIMEDOUT, %r14 - js 6f - - /* Store relative timeout. */ -21: movq %rcx, 24(%rsp) - movq %rdx, 32(%rsp) - - movl cond_futex(%rdi), %r12d +38: movl cond_futex(%rdi), %r12d /* Unlock. */ LOCK @@ -193,27 +148,26 @@ __pthread_cond_timedwait: #else decl cond_lock(%rdi) #endif - jne 3f + jne 33f -.LcleanupSTART: -4: callq __pthread_enable_asynccancel +.LcleanupSTART1: +34: callq __pthread_enable_asynccancel movl %eax, (%rsp) - leaq 24(%rsp), %r10 + movq %r13, %r10 cmpq $-1, dep_mutex(%rdi) - movq %r12, %rdx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAIT, %eax - movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi + movl $FUTEX_WAIT_BITSET, %eax + movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi -# if FUTEX_WAIT != 0 - orl $FUTEX_WAIT, %esi -# endif -#endif + /* The following only works like this because we only support + two clocks, represented using a single bit. */ + xorl %eax, %eax + testl $1, cond_nwaiters(%rdi) + movl $FUTEX_CLOCK_REALTIME, %edx + movl $0xffffffff, %r9d + cmove %edx, %eax + orl %eax, %esi + movq %r12, %rdx addq $cond_futex, %rdi movl $SYS_futex, %eax syscall @@ -221,7 +175,7 @@ __pthread_cond_timedwait: movl (%rsp), %edi callq __pthread_disable_asynccancel -.LcleanupEND: +.LcleanupEND1: /* Lock. 
*/ movq 8(%rsp), %rdi @@ -233,45 +187,45 @@ __pthread_cond_timedwait: #else cmpxchgl %esi, cond_lock(%rdi) #endif - jne 5f + jne 35f -6: movl broadcast_seq(%rdi), %edx +36: movl broadcast_seq(%rdi), %edx movq woken_seq(%rdi), %rax movq wakeup_seq(%rdi), %r9 cmpl 4(%rsp), %edx - jne 23f + jne 53f - cmpq 40(%rsp), %r9 - jbe 15f + cmpq 24(%rsp), %r9 + jbe 45f cmpq %rax, %r9 - ja 9f + ja 39f -15: cmpq $-ETIMEDOUT, %r14 - jne 8b +45: cmpq $-ETIMEDOUT, %r14 + jne 38b -13: incq wakeup_seq(%rdi) +99: incq wakeup_seq(%rdi) incl cond_futex(%rdi) movl $ETIMEDOUT, %r14d - jmp 14f + jmp 44f -23: xorq %r14, %r14 - jmp 24f +53: xorq %r14, %r14 + jmp 54f -9: xorq %r14, %r14 -14: incq woken_seq(%rdi) +39: xorq %r14, %r14 +44: incq woken_seq(%rdi) -24: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) +54: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) /* Wake up a thread which wants to destroy the condvar object. */ cmpq $0xffffffffffffffff, total_seq(%rdi) - jne 25f + jne 55f movl cond_nwaiters(%rdi), %eax andl $~((1 << nwaiters_shift) - 1), %eax - jne 25f + jne 55f addq $cond_nwaiters, %rdi cmpq $-1, dep_mutex-cond_nwaiters(%rdi) @@ -290,21 +244,21 @@ __pthread_cond_timedwait: syscall subq $cond_nwaiters, %rdi -25: LOCK +55: LOCK #if cond_lock == 0 decl (%rdi) #else decl cond_lock(%rdi) #endif - jne 10f + jne 40f -11: movq 16(%rsp), %rdi +41: movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock testq %rax, %rax cmoveq %r14, %rax -18: addq $FRAME_SIZE, %rsp +48: addq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(-FRAME_SIZE) popq %r14 cfi_adjust_cfa_offset(-8) @@ -319,8 +273,7 @@ __pthread_cond_timedwait: retq /* Initial locking failed. */ -1: - cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE) +31: cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE) cfi_rel_offset(%r12, FRAME_SIZE + 16) cfi_rel_offset(%r13, FRAME_SIZE + 8) cfi_rel_offset(%r14, FRAME_SIZE) @@ -332,10 +285,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_lock_wait - jmp 2b + jmp 32b /* Unlock in loop requires wakeup. */ -3: +33: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -344,10 +297,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_unlock_wake - jmp 4b + jmp 34b /* Locking in loop failed. */ -5: +35: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -359,10 +312,10 @@ __pthread_cond_timedwait: #if cond_lock != 0 subq $cond_lock, %rdi #endif - jmp 6b + jmp 36b /* Unlock after loop requires wakeup. */ -10: +40: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -371,10 +324,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_unlock_wake - jmp 11b + jmp 41b /* The initial unlocking of the mutex failed. */ -16: movq 8(%rsp), %rdi +46: movq 8(%rsp), %rdi movq %rax, (%rsp) LOCK #if cond_lock == 0 @@ -382,7 +335,7 @@ __pthread_cond_timedwait: #else decl cond_lock(%rdi) #endif - jne 17f + jne 47f #if cond_lock != 0 addq $cond_lock, %rdi @@ -393,23 +346,229 @@ __pthread_cond_timedwait: cmovne %eax, %esi callq __lll_unlock_wake -17: movq (%rsp), %rax - jmp 18b +47: movq (%rsp), %rax + jmp 48b + + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: + /* Get internal lock. */ + movl $1, %esi + xorl %eax, %eax + LOCK +# if cond_lock == 0 + cmpxchgl %esi, (%rdi) +# else + cmpxchgl %esi, cond_lock(%rdi) +# endif + jnz 1f + + /* Unlock the mutex. 
*/ +2: movq 16(%rsp), %rdi + xorl %esi, %esi + callq __pthread_mutex_unlock_usercnt + + testl %eax, %eax + jne 46b + + movq 8(%rsp), %rdi + incq total_seq(%rdi) + incl cond_futex(%rdi) + addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Get and store current wakeup_seq value. */ + movq 8(%rsp), %rdi + movq wakeup_seq(%rdi), %r9 + movl broadcast_seq(%rdi), %edx + movq %r9, 24(%rsp) + movl %edx, 4(%rsp) -#if defined __NR_clock_gettime && !defined __ASSUME_POSIX_TIMERS + /* Get the current time. */ +8: +# ifdef __NR_clock_gettime + /* Get the clock number. Note that the field in the condvar + structure stores the number minus 1. */ + movq 8(%rsp), %rdi + movl cond_nwaiters(%rdi), %edi + andl $((1 << nwaiters_shift) - 1), %edi + /* Only clocks 0 and 1 are allowed so far. Both are handled in the + kernel. */ + leaq 32(%rsp), %rsi +# ifdef SHARED + movq __vdso_clock_gettime@GOTPCREL(%rip), %rax + movq (%rax), %rax + PTR_DEMANGLE (%rax) + jz 26f + call *%rax + jmp 27f +# endif +26: movl $__NR_clock_gettime, %eax + syscall +27: +# ifndef __ASSUME_POSIX_TIMERS + cmpq $-ENOSYS, %rax + je 19f +# endif + + /* Compute relative timeout. */ + movq (%r13), %rcx + movq 8(%r13), %rdx + subq 32(%rsp), %rcx + subq 40(%rsp), %rdx +# else + leaq 24(%rsp), %rdi + xorl %esi, %esi + movq $VSYSCALL_ADDR_vgettimeofday, %rax + callq *%rax + + /* Compute relative timeout. */ + movq 40(%rsp), %rax + movl $1000, %edx + mul %rdx /* Milli seconds to nano seconds. */ + movq (%r13), %rcx + movq 8(%r13), %rdx + subq 32(%rsp), %rcx + subq %rax, %rdx +# endif + jns 12f + addq $1000000000, %rdx + decq %rcx +12: testq %rcx, %rcx + movq 8(%rsp), %rdi + movq $-ETIMEDOUT, %r14 + js 6f + + /* Store relative timeout. */ +21: movq %rcx, 32(%rsp) + movq %rdx, 40(%rsp) + + movl cond_futex(%rdi), %r12d + + /* Unlock. */ + LOCK +# if cond_lock == 0 + decl (%rdi) +# else + decl cond_lock(%rdi) +# endif + jne 3f + +.LcleanupSTART2: +4: callq __pthread_enable_asynccancel + movl %eax, (%rsp) + + leaq 32(%rsp), %r10 + cmpq $-1, dep_mutex(%rdi) + movq %r12, %rdx +# ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAIT, %eax + movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +# else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi +# if FUTEX_WAIT != 0 + orl $FUTEX_WAIT, %esi +# endif +# endif + addq $cond_futex, %rdi + movl $SYS_futex, %eax + syscall + movq %rax, %r14 + + movl (%rsp), %edi + callq __pthread_disable_asynccancel +.LcleanupEND2: + + /* Lock. */ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +# if cond_lock == 0 + cmpxchgl %esi, (%rdi) +# else + cmpxchgl %esi, cond_lock(%rdi) +# endif + jne 5f + +6: movl broadcast_seq(%rdi), %edx + + movq woken_seq(%rdi), %rax + + movq wakeup_seq(%rdi), %r9 + + cmpl 4(%rsp), %edx + jne 53b + + cmpq 24(%rsp), %r9 + jbe 45b + + cmpq %rax, %r9 + ja 39b + + cmpq $-ETIMEDOUT, %r14 + jne 8b + + jmp 99b + + /* Initial locking failed. */ +1: cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE) + cfi_rel_offset(%r12, FRAME_SIZE + 16) + cfi_rel_offset(%r13, FRAME_SIZE + 8) + cfi_rel_offset(%r14, FRAME_SIZE) +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait + jmp 2b + + /* Unlock in loop requires wakeup. */ +3: +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + jmp 4b + + /* Locking in loop failed. 
*/ +5: +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +# if cond_lock != 0 + subq $cond_lock, %rdi +# endif + jmp 6b + +# if defined __NR_clock_gettime && !defined __ASSUME_POSIX_TIMERS /* clock_gettime not available. */ -19: leaq 24(%rsp), %rdi +19: leaq 32(%rsp), %rdi xorl %esi, %esi movq $VSYSCALL_ADDR_vgettimeofday, %rax callq *%rax /* Compute relative timeout. */ - movq 32(%rsp), %rax + movq 40(%rsp), %rax movl $1000, %edx mul %rdx /* Milli seconds to nano seconds. */ movq (%r13), %rcx movq 8(%r13), %rdx - subq 24(%rsp), %rcx + subq 32(%rsp), %rcx subq %rax, %rdx jns 20f addq $1000000000, %rdx @@ -419,6 +578,7 @@ __pthread_cond_timedwait: movq $-ETIMEDOUT, %r14 js 6b jmp 21b +# endif #endif .size __pthread_cond_timedwait, .-__pthread_cond_timedwait versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait, @@ -575,10 +735,16 @@ __condvar_cleanup2: .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: - .uleb128 .LcleanupSTART-.LSTARTCODE - .uleb128 .LcleanupEND-.LcleanupSTART + .uleb128 .LcleanupSTART1-.LSTARTCODE + .uleb128 .LcleanupEND1-.LcleanupSTART1 .uleb128 __condvar_cleanup2-.LSTARTCODE .uleb128 0 +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .uleb128 .LcleanupSTART2-.LSTARTCODE + .uleb128 .LcleanupEND2-.LcleanupSTART2 + .uleb128 __condvar_cleanup2-.LSTARTCODE + .uleb128 0 +#endif .uleb128 .LcallUR-.LSTARTCODE .uleb128 .LENDCODE-.LcallUR .uleb128 0 From f8b6cd2182494252fdddafe72331b631afac08d8 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 12:45:27 -0700 Subject: [PATCH 42/50] Extend pthread_cond_timedwait tests. --- nptl/ChangeLog | 3 +++ nptl/tst-cond11.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index c7e1d048be..d5b812e6f9 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,8 @@ 2009-07-18 Ulrich Drepper + * tst-cond11.c (run_test): Add test to check that the timeout is + long enough. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (__pthread_cond_timedwait): If possible use FUTEX_WAIT_BITSET to directly use absolute timeout. diff --git a/nptl/tst-cond11.c b/nptl/tst-cond11.c index 0de4d56137..4d0c7dd225 100644 --- a/nptl/tst-cond11.c +++ b/nptl/tst-cond11.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2003. @@ -130,6 +130,20 @@ run_test (clockid_t cl) return 1; } + struct timespec ts2; + if (clock_gettime (cl, &ts2) != 0) + { + puts ("second clock_gettime failed"); + return 1; + } + + if (ts2.tv_sec < ts.tv_sec + || (ts2.tv_sec == ts.tv_sec && ts2.tv_nsec < ts.tv_nsec)) + { + puts ("timeout too short"); + return 1; + } + if (pthread_mutex_unlock (&mut) != 0) { puts ("mutex_unlock failed"); From d9201c13656dc73fba9c5f5f96c3d0e2c7971218 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 13:08:21 -0700 Subject: [PATCH 43/50] Remove leftover cfi. 
--- nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 21115fddec..45a9a4213b 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -514,10 +514,7 @@ __pthread_cond_timedwait: jmp 99b /* Initial locking failed. */ -1: cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE) - cfi_rel_offset(%r12, FRAME_SIZE + 16) - cfi_rel_offset(%r13, FRAME_SIZE + 8) - cfi_rel_offset(%r14, FRAME_SIZE) +1: # if cond_lock != 0 addq $cond_lock, %rdi # endif From d979611eb9f18ead1b8da3e956b941545f682565 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 21:35:33 -0700 Subject: [PATCH 44/50] Extend x86-64 pthread_rwlock_timedwrlock to use futex syscall with absolute timeout. --- nptl/ChangeLog | 4 ++ .../linux/x86_64/pthread_rwlock_timedwrlock.S | 61 +++++++++++++++---- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index d5b812e6f9..719d781d3a 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,9 @@ 2009-07-18 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S + (pthread_rwlock_timedwrlock): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + * tst-cond11.c (run_test): Add test to check that the timeout is long enough. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S index dde6b58836..bfc653da29 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -36,16 +36,21 @@ pthread_rwlock_timedwrlock: cfi_startproc pushq %r12 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) pushq %r13 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define VALREG %edx +#else pushq %r14 cfi_adjust_cfa_offset(8) - cfi_offset(%r12, -16) - cfi_offset(%r13, -24) - cfi_offset(%r14, -32) + cfi_rel_offset(%r14, 0) subq $16, %rsp cfi_adjust_cfa_offset(16) +# define VALREG %r14d +#endif movq %rdi, %r12 movq %rsi, %r13 @@ -74,7 +79,7 @@ pthread_rwlock_timedwrlock: incl WRITERS_QUEUED(%r12) je 4f - movl WRITERS_WAKEUP(%r12), %r14d + movl WRITERS_WAKEUP(%r12), VALREG LOCK #if MUTEX == 0 @@ -84,8 +89,33 @@ pthread_rwlock_timedwrlock: #endif jne 10f +11: +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif + + movl $FUTEX_PRIVATE_FLAG|FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi + xorl PSHARED(%r12), %esi + movq %r13, %r10 + movl $0xffffffff, %r9d +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + movl %r14d, %edx +#endif +21: leaq WRITERS_WAKEUP(%r12), %rdi + movl $SYS_futex, %eax + syscall + movq %rax, %rdx + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .subsection 2 +.Lreltmo: /* Get current time. 
*/ -11: movq %rsp, %rdi + movq %rsp, %rdi xorl %esi, %esi movq $VSYSCALL_ADDR_vgettimeofday, %rax callq *%rax @@ -122,13 +152,12 @@ pthread_rwlock_timedwrlock: #endif movq %rsp, %r10 movl %r14d, %edx - leaq WRITERS_WAKEUP(%r12), %rdi - movl $SYS_futex, %eax - syscall - movq %rax, %rdx -17: - /* Reget the lock. */ + jmp 21b + .previous +#endif + +17: /* Reget the lock. */ movl $1, %esi xorl %eax, %eax LOCK @@ -160,11 +189,13 @@ pthread_rwlock_timedwrlock: 7: movq %rdx, %rax +#ifndef __ASSUME_PRIVATE_FUTEX addq $16, %rsp cfi_adjust_cfa_offset(-16) popq %r14 cfi_adjust_cfa_offset(-8) cfi_restore(%r14) +#endif popq %r13 cfi_adjust_cfa_offset(-8) cfi_restore(%r13) @@ -173,10 +204,16 @@ pthread_rwlock_timedwrlock: cfi_restore(%r12) retq +#ifdef __ASSUME_PRIVATE_FUTEX + cfi_adjust_cfa_offset(16) + cfi_rel_offset(%r12, 8) + cfi_rel_offset(%r13, 0) +#else cfi_adjust_cfa_offset(40) cfi_offset(%r12, -16) cfi_offset(%r13, -24) cfi_offset(%r14, -32) +#endif 1: movl PSHARED(%rdi), %esi #if MUTEX != 0 addq $MUTEX, %rdi From 4c74e6522a9e6e9c0ae921e3150bf735cfeb67b0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 21:41:52 -0700 Subject: [PATCH 45/50] Pretty printing last change. --- .../sysv/linux/x86_64/pthread_rwlock_timedwrlock.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S index bfc653da29..cd867b60dc 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S @@ -138,18 +138,18 @@ pthread_rwlock_timedwrlock: movq %rcx, (%rsp) /* Store relative timeout. */ movq %rdi, 8(%rsp) -#ifdef __ASSUME_PRIVATE_FUTEX +# ifdef __ASSUME_PRIVATE_FUTEX movl $FUTEX_PRIVATE_FLAG|FUTEX_WAIT, %esi xorl PSHARED(%r12), %esi -#else -# if FUTEX_WAIT == 0 - movl PSHARED(%r12), %esi # else +# if FUTEX_WAIT == 0 + movl PSHARED(%r12), %esi +# else movl $FUTEX_WAIT, %esi orl PSHARED(%r12), %esi -# endif +# endif xorl %fs:PRIVATE_FUTEX, %esi -#endif +# endif movq %rsp, %r10 movl %r14d, %edx From 32c6c342b6bc10396785a4542c22f6f95deca684 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 21:53:26 -0700 Subject: [PATCH 46/50] Extend x86-64 pthread_rwlock_timedrdlock to use futex syscall with absolute timeout. --- nptl/ChangeLog | 2 + .../linux/x86_64/pthread_rwlock_timedrdlock.S | 73 ++++++++++++++----- 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 719d781d3a..44f9846365 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -3,6 +3,8 @@ * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S (pthread_rwlock_timedwrlock): If possible use FUTEX_WAIT_BITSET to directly use absolute timeout. + * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S + (pthread_rwlock_timedrdlock): Likewise. * tst-cond11.c (run_test): Add test to check that the timeout is long enough. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S index 366c96fc36..23b218af34 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
Contributed by Ulrich Drepper , 2002. @@ -36,16 +36,21 @@ pthread_rwlock_timedrdlock: cfi_startproc pushq %r12 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) pushq %r13 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define VALREG %edx +#else pushq %r14 cfi_adjust_cfa_offset(8) - cfi_offset(%r12, -16) - cfi_offset(%r13, -24) - cfi_offset(%r14, -32) + cfi_rel_offset(%r14, 0) subq $16, %rsp cfi_adjust_cfa_offset(16) +# define VALREG %r14d +#endif movq %rdi, %r12 movq %rsi, %r13 @@ -76,7 +81,7 @@ pthread_rwlock_timedrdlock: incl READERS_QUEUED(%r12) je 4f - movl READERS_WAKEUP(%r12), %r14d + movl READERS_WAKEUP(%r12), VALREG /* Unlock. */ LOCK @@ -87,8 +92,33 @@ pthread_rwlock_timedrdlock: #endif jne 10f +11: +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif + + movl $FUTEX_PRIVATE_FLAG|FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi + xorl PSHARED(%r12), %esi + movq %r13, %r10 + movl $0xffffffff, %r9d +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + movl %r14d, %edx +#endif +21: leaq READERS_WAKEUP(%r12), %rdi + movl $SYS_futex, %eax + syscall + movq %rax, %rdx + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .subsection 2 +.Lreltmo: /* Get current time. */ -11: movq %rsp, %rdi + movq %rsp, %rdi xorl %esi, %esi movq $VSYSCALL_ADDR_vgettimeofday, %rax callq *%rax @@ -111,27 +141,26 @@ pthread_rwlock_timedrdlock: movq %rcx, (%rsp) /* Store relative timeout. */ movq %rdi, 8(%rsp) -#ifdef __ASSUME_PRIVATE_FUTEX +# ifdef __ASSUME_PRIVATE_FUTEX movl $FUTEX_PRIVATE_FLAG|FUTEX_WAIT, %esi xorl PSHARED(%r12), %esi -#else -# if FUTEX_WAIT == 0 - movl PSHARED(%r12), %esi # else +# if FUTEX_WAIT == 0 + movl PSHARED(%r12), %esi +# else movl $FUTEX_WAIT, %esi orl PSHARED(%r12), %esi -# endif +# endif xorl %fs:PRIVATE_FUTEX, %esi -#endif +# endif movq %rsp, %r10 movl %r14d, %edx - leaq READERS_WAKEUP(%r12), %rdi - movl $SYS_futex, %eax - syscall - movq %rax, %rdx -17: - /* Reget the lock. */ + jmp 21b + .previous +#endif + +17: /* Reget the lock. */ movl $1, %esi xorl %eax, %eax LOCK @@ -163,11 +192,13 @@ pthread_rwlock_timedrdlock: 7: movq %rdx, %rax +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME addq $16, %rsp cfi_adjust_cfa_offset(-16) popq %r14 cfi_adjust_cfa_offset(-8) cfi_restore(%r14) +#endif popq %r13 cfi_adjust_cfa_offset(-8) cfi_restore(%r13) @@ -176,10 +207,16 @@ pthread_rwlock_timedrdlock: cfi_restore(%r12) retq +#ifdef __ASSUME_PRIVATE_FUTEX + cfi_adjust_cfa_offset(16) + cfi_rel_offset(%r12, 8) + cfi_rel_offset(%r13, 0) +#else cfi_adjust_cfa_offset(40) cfi_offset(%r12, -16) cfi_offset(%r13, -24) cfi_offset(%r14, -32) +#endif 1: movl PSHARED(%rdi), %esi #if MUTEX != 0 addq $MUTEX, %rdi From 3d77b2687f984700f40e26e0fb06c99eeea1c033 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 18 Jul 2009 22:07:25 -0700 Subject: [PATCH 47/50] Add more sem_timedwait tests. --- nptl/ChangeLog | 3 +++ nptl/Makefile | 2 ++ nptl/tst-sem5.c | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 44f9846365..dc576a7d51 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,8 @@ 2009-07-18 Ulrich Drepper + * tst-sem5.c (do_test): Add test for premature timeout. + * Makefile: Linu tst-sem5 with librt. + * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S (pthread_rwlock_timedwrlock): If possible use FUTEX_WAIT_BITSET to directly use absolute timeout. 
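The premature-timeout check added to tst-sem5.c below follows the same
pattern as the earlier tst-cond11 change (patch 42): once the call
reports ETIMEDOUT, read CLOCK_REALTIME again and verify that the
absolute deadline has in fact passed.  A minimal C sketch of the
pattern (the helper name and error handling are illustrative, not the
test's exact code):

    #include <errno.h>
    #include <semaphore.h>
    #include <stdio.h>
    #include <time.h>

    /* Sketch: sem_timedwait must not report ETIMEDOUT before the
       absolute DEADLINE has actually passed.  */
    static int
    check_no_early_timeout (sem_t *sem, const struct timespec *deadline)
    {
      if (sem_timedwait (sem, deadline) == 0)
        return 0;               /* Semaphore was posted; nothing to check.  */
      if (errno != ETIMEDOUT)
        return 1;               /* Unexpected failure.  */

      struct timespec now;
      if (clock_gettime (CLOCK_REALTIME, &now) != 0)
        return 1;

      if (now.tv_sec < deadline->tv_sec
          || (now.tv_sec == deadline->tv_sec
              && now.tv_nsec < deadline->tv_nsec))
        {
          puts ("timeout too short");
          return 1;
        }
      return 0;
    }

The Makefile hunks below link tst-sem5 against librt ("Linu" in the
ChangeLog entry above is a typo for "Link") because clock_gettime is
provided by librt here.
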
diff --git a/nptl/Makefile b/nptl/Makefile index 42a0b86282..f96ed458e6 100644 --- a/nptl/Makefile +++ b/nptl/Makefile @@ -479,6 +479,7 @@ $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so ifeq (yes,$(build-shared)) $(objpfx)tst-cond11: $(common-objpfx)rt/librt.so $(objpfx)tst-cond19: $(common-objpfx)rt/librt.so +$(objpfx)tst-sem5: $(common-objpfx)rt/librt.so $(objpfx)tst-cancel17: $(common-objpfx)rt/librt.so $(objpfx)tst-cancelx17: $(common-objpfx)rt/librt.so $(objpfx)tst-cancel18: $(common-objpfx)rt/librt.so @@ -492,6 +493,7 @@ $(objpfx)tst-_res1: $(objpfx)tst-_res1mod2.so $(shared-thread-library) else $(objpfx)tst-cond11: $(common-objpfx)rt/librt.a $(objpfx)tst-cond19: $(common-objpfx)rt/librt.a +$(objpfx)tst-sem5: $(common-objpfx)rt/librt.a $(objpfx)tst-cancel17: $(common-objpfx)rt/librt.a $(objpfx)tst-cancelx17: $(common-objpfx)rt/librt.a $(objpfx)tst-cancel18: $(common-objpfx)rt/librt.a diff --git a/nptl/tst-sem5.c b/nptl/tst-sem5.c index cb85b8e769..d3ebe26a40 100644 --- a/nptl/tst-sem5.c +++ b/nptl/tst-sem5.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -73,6 +73,20 @@ do_test (void) return 1; } + struct timespec ts2; + if (clock_gettime (CLOCK_REALTIME, &ts2) != 0) + { + puts ("clock_gettime failed"); + return 1; + } + + if (ts2.tv_sec < ts.tv_sec + || (ts2.tv_sec == ts.tv_sec && ts2.tv_nsec < ts.tv_nsec)) + { + puts ("timeout too short"); + return 1; + } + return 0; } From e2dca2fea3f1a0a7b05fd10589f469496f9c42a3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 19 Jul 2009 00:00:17 -0700 Subject: [PATCH 48/50] Extend x86-64 __lll_robust_timedlock_wait to use futex syscall with absolute timeout. --- nptl/ChangeLog | 4 + .../sysv/linux/x86_64/lowlevelrobustlock.S | 78 ++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index dc576a7d51..e9cac73459 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,9 @@ 2009-07-18 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S + (__lll_robust_timedlock_wait): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + * tst-sem5.c (do_test): Add test for premature timeout. * Makefile: Linu tst-sem5 with librt. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S index fa7516ef71..02db0a4f9d 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -32,6 +32,8 @@ #ifdef __ASSUME_PRIVATE_FUTEX # define LOAD_FUTEX_WAIT(reg) \ xorl $(FUTEX_WAIT | FUTEX_PRIVATE_FLAG), reg +# define LOAD_FUTEX_WAIT_ABS(reg) \ + xorl $(FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME), reg #else # if FUTEX_WAIT == 0 # define LOAD_FUTEX_WAIT(reg) \ @@ -43,6 +45,10 @@ andl %fs:PRIVATE_FUTEX, reg ; \ orl $FUTEX_WAIT, reg # endif +# define LOAD_FUTEX_WAIT_ABS(reg) \ + xorl $FUTEX_PRIVATE_FLAG, reg ; \ + andl %fs:PRIVATE_FUTEX, reg ; \ + orl $FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME, reg #endif /* For the calculation see asm/vsyscall.h. 
*/ @@ -110,6 +116,73 @@ __lll_robust_lock_wait: .align 16 __lll_robust_timedlock_wait: cfi_startproc +# ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +# endif + + pushq %r9 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r9, 0) + movq %rdx, %r10 + movl $0xffffffff, %r9d + LOAD_FUTEX_WAIT_ABS (%esi) + +1: testl $FUTEX_OWNER_DIED, %eax + jnz 3f + + movl %eax, %edx + orl $FUTEX_WAITERS, %edx + + cmpl %eax, %edx + je 5f + + LOCK + cmpxchgl %edx, (%rdi) + movq $0, %rcx /* Must use mov to avoid changing cc. */ + jnz 6f + +5: movl $SYS_futex, %eax + syscall + movl %eax, %ecx + + movl (%rdi), %eax + +6: testl %eax, %eax + jne 2f + + movl %fs:TID, %edx + orl $FUTEX_WAITERS, %edx + LOCK + cmpxchgl %edx, (%rdi) + jnz 2f + +3: popq %r9 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r9) + retq + + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r9, 0) + /* Check whether the time expired. */ +2: cmpl $-ETIMEDOUT, %ecx + je 4f + cmpl $-EINVAL, %ecx + jne 1b + +4: movl %ecx, %eax + negl %eax + jmp 3b + cfi_adjust_cfa_offset(-8) + cfi_restore(%r9) + + +# ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: /* Check for a valid timeout value. */ cmpq $1000000000, 8(%rdx) jae 3f @@ -223,10 +296,11 @@ __lll_robust_timedlock_wait: cfi_offset(%r12, -32) cfi_offset(%r13, -40) /* Check whether the time expired. */ -7: cmpq $-ETIMEDOUT, %rcx +7: cmpl $-ETIMEDOUT, %ecx jne 1b 8: movl $ETIMEDOUT, %eax jmp 6b +#endif cfi_endproc .size __lll_robust_timedlock_wait,.-__lll_robust_timedlock_wait From 515a8908cedcf7432270f410e4a749e4ce07a072 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 19 Jul 2009 14:54:56 -0700 Subject: [PATCH 49/50] Make x86-64 pthread_cond_timedwait more robust. It just happens that __pthread_enable_asynccancel doesn't modify the $rdi register. But this isn't guaranteed. Hence we reload the register after the calls. --- nptl/ChangeLog | 5 +++++ nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S | 2 ++ 2 files changed, 7 insertions(+) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index e9cac73459..785100d852 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,8 @@ +2009-07-19 Ulrich Drepper + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S + (__pthread_cond_timedwait): Make more robust. + 2009-07-18 Ulrich Drepper * sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 45a9a4213b..1b19fdb8dc 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -153,6 +153,7 @@ __pthread_cond_timedwait: .LcleanupSTART1: 34: callq __pthread_enable_asynccancel movl %eax, (%rsp) + movq 8(%rsp), %rdi movq %r13, %r10 cmpq $-1, dep_mutex(%rdi) @@ -456,6 +457,7 @@ __pthread_cond_timedwait: .LcleanupSTART2: 4: callq __pthread_enable_asynccancel movl %eax, (%rsp) + movq 8(%rsp), %rdi leaq 32(%rsp), %r10 cmpq $-1, dep_mutex(%rdi) From 42e69bcf1137fccfd7a95645a9d316c6490b9ff9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 19 Jul 2009 20:56:40 -0700 Subject: [PATCH 50/50] Support requeueing for condvars using PI mutex. x86-64 only. Add support for the new FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI options of futex. 
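From user space the two operations pair up as follows.  This is a
minimal sketch assuming kernel headers that define the new constants;
the private-futex flag, the timeout handling and the FUTEX_WAKE
fallback for older kernels are all elided:

    #include <limits.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Waiter side: block on the condvar futex.  When woken through
       FUTEX_CMP_REQUEUE_PI, the kernel returns with the PI mutex
       futex already locked on our behalf.  */
    static long
    wait_requeue_pi (uint32_t *cond, uint32_t seen, uint32_t *pi_mutex)
    {
      return syscall (SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seen,
                      (void *) 0 /* no timeout */, pi_mutex, 0);
    }

    /* Broadcast side: wake one waiter and requeue all others onto the
       PI mutex, provided *COND still equals EXPECTED.  The signal path
       is the same call with a requeue count of 0 instead of INT_MAX.  */
    static long
    cmp_requeue_pi (uint32_t *cond, uint32_t expected, uint32_t *pi_mutex)
    {
      return syscall (SYS_futex, cond, FUTEX_CMP_REQUEUE_PI,
                      1, (unsigned long) INT_MAX, pi_mutex, expected);
    }

The assembly below does the same: pthread_cond_signal requeues zero
waiters, pthread_cond_broadcast requeues up to 0x7fffffff, and any
error return makes both fall back to plain FUTEX_WAKE, which also
covers kernels without requeue-PI support.
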
--- nptl/ChangeLog | 9 +++ .../unix/sysv/linux/x86_64/lowlevellock.h | 2 + .../linux/x86_64/pthread_cond_broadcast.S | 26 +++++-- .../sysv/linux/x86_64/pthread_cond_signal.S | 47 +++++++++---- .../linux/x86_64/pthread_cond_timedwait.S | 69 ++++++++++++++++--- .../sysv/linux/x86_64/pthread_cond_wait.S | 60 +++++++++++++--- 6 files changed, 177 insertions(+), 36 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 785100d852..c747be419f 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,5 +1,14 @@ 2009-07-19 Ulrich Drepper + * sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Define + FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S: If mutex + is a PI mutex, then use FUTEX_CMP_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S: Likewise. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: If mutex + is a PI mutex, then use FUTEX_WAIT_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S (__pthread_cond_timedwait): Make more robust. diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h index 0b7e3bbaba..9b15bfbc57 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h @@ -54,6 +54,8 @@ #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S index 6155255eb0..0f10ec910c 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -70,12 +70,14 @@ __pthread_cond_broadcast: 8: cmpq $-1, %r8 je 9f - /* XXX: The kernel so far doesn't support requeue to PI futex. */ - /* XXX: The kernel only supports FUTEX_CMP_REQUEUE to the same - type of futex (private resp. shared). */ - testl $(PI_BIT | PS_BIT), MUTEX_KIND(%r8) + /* Do not use requeue for pshared condvars. */ + testl $PS_BIT, MUTEX_KIND(%r8) jne 9f + /* Requeue to a PI mutex if the PI bit is set. */ + testl $PI_BIT, MUTEX_KIND(%r8) + jne 81f + /* Wake up all threads. */ #ifdef __ASSUME_PRIVATE_FUTEX movl $(FUTEX_CMP_REQUEUE|FUTEX_PRIVATE_FLAG), %esi @@ -97,6 +99,20 @@ __pthread_cond_broadcast: 10: xorl %eax, %eax retq + /* Wake up all threads. */ +81: movl $(FUTEX_CMP_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + movl $SYS_futex, %eax + movl $1, %edx + movl $0x7fffffff, %r10d + syscall + + /* For any kind of error, which mainly is EAGAIN, we try again + with WAKE. The general test also covers running on old + kernels. */ + cmpq $-4095, %rax + jb 10b + jmp 9f + .align 16 /* Unlock. 
*/ 4: LOCK diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S index 8f65f2cd69..f1050fea7c 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -56,19 +57,23 @@ __pthread_cond_signal: /* Wake up one thread. */ cmpq $-1, dep_mutex(%r8) + movl $FUTEX_WAKE_OP, %esi movl $1, %edx + movl $SYS_futex, %eax + je 8f + + /* Get the address of the mutex used. */ + movq dep_mutex(%r8), %rcx + testl $PI_BIT, MUTEX_KIND(%rcx) + jne 9f + #ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE_OP, %eax movl $(FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi #else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE_OP, %esi + orl %fs:PRIVATE_FUTEX, %esi #endif - movl $1, %r10d - movl $SYS_futex, %eax + +8: movl $1, %r10d #if cond_lock != 0 addq $cond_lock, %r8 #endif @@ -85,9 +90,27 @@ __pthread_cond_signal: xorl %eax, %eax retq -7: /* %esi should be either FUTEX_WAKE_OP or - FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG from the previous syscall. */ - xorl $(FUTEX_WAKE ^ FUTEX_WAKE_OP), %esi + /* Wake up one thread and requeue none in the PI Mutex case. */ +9: movl $(FUTEX_CMP_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + movq %rcx, %r8 + xorq %r10, %r10 + movl (%rdi), %r9d // XXX Can this be right? + syscall + + leaq -cond_futex(%rdi), %r8 + + /* For any kind of error, we try again with WAKE. + The general test also covers running on old kernels. */ + cmpq $-4095, %rax + jb 4f + +7: +#ifdef __ASSUME_PRIVATE_FUTEX + andl $FUTEX_PRIVATE_FLAG, %esi +#else + andl %fs:PRIVATE_FUTEX, %esi +#endif + orl $FUTEX_WAKE, %esi movl $SYS_futex, %eax /* %rdx should be 1 already from $FUTEX_WAKE_OP syscall. movl $1, %edx */ diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 1b19fdb8dc..f81466e1a5 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,9 @@ __pthread_cond_timedwait: pushq %r14 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r14, 0) + pushq %r15 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r15, 0) #ifdef __ASSUME_FUTEX_CLOCK_REALTIME # define FRAME_SIZE 32 #else @@ -160,9 +164,41 @@ __pthread_cond_timedwait: movl $FUTEX_WAIT_BITSET, %eax movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi cmove %eax, %esi + je 60f + + movq dep_mutex(%rdi), %r8 + /* Requeue to a PI mutex if the PI bit is set. */ + testl $PI_BIT, MUTEX_KIND(%r8) + je 60f + + movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + xorl %eax, %eax /* The following only works like this because we only support two clocks, represented using a single bit. 
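+	   (Bit 0 of cond_nwaiters holds the clock id, 0 for
+	   CLOCK_REALTIME and 1 for CLOCK_MONOTONIC, so
+	   FUTEX_CLOCK_REALTIME is ORed in only when that bit is clear.)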
+	testl	$1, cond_nwaiters(%rdi)
+	movl	$FUTEX_CLOCK_REALTIME, %edx
+	cmove	%edx, %eax
+	orl	%eax, %esi
+	movq	%r12, %rdx
+	addq	$cond_futex, %rdi
+	movl	$SYS_futex, %eax
+	syscall
+
+	movl	$1, %r15d
+#ifdef __ASSUME_REQUEUE_PI
+	jmp	62f
+#else
+	cmpq	$-4095, %rax
+	jnae	62f
+
+	movl	$(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
+	subq	$cond_futex, %rdi
+#endif
+
+60:	xorl	%r15d, %r15d
 	xorl	%eax, %eax
+	/* The following only works like this because we only support
+	   two clocks, represented using a single bit.  */
 	testl	$1, cond_nwaiters(%rdi)
 	movl	$FUTEX_CLOCK_REALTIME, %edx
 	movl	$0xffffffff, %r9d
@@ -172,7 +208,7 @@ __pthread_cond_timedwait:
 	addq	$cond_futex, %rdi
 	movl	$SYS_futex, %eax
 	syscall
-	movq	%rax, %r14
+62:	movq	%rax, %r14
 
 	movl	(%rsp), %edi
 	callq	__pthread_disable_asynccancel
@@ -253,14 +289,23 @@ __pthread_cond_timedwait:
 #endif
 	jne	40f
 
-41:	movq	16(%rsp), %rdi
+	/* If requeue_pi is used the kernel performs the locking of the
+	   mutex.  */
+41:	xorl	%eax, %eax
+	testl	%r15d, %r15d
+	jnz	63f
+
+	movq	16(%rsp), %rdi
 	callq	__pthread_mutex_cond_lock
 
-	testq	%rax, %rax
+63:	testq	%rax, %rax
 	cmoveq	%r14, %rax
 
 48:	addq	$FRAME_SIZE, %rsp
 	cfi_adjust_cfa_offset(-FRAME_SIZE)
+	popq	%r15
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r15)
 	popq	%r14
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r14)
@@ -274,10 +319,11 @@ __pthread_cond_timedwait:
 	retq
 
 	/* Initial locking failed.  */
-31:	cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE)
-	cfi_rel_offset(%r12, FRAME_SIZE + 16)
-	cfi_rel_offset(%r13, FRAME_SIZE + 8)
-	cfi_rel_offset(%r14, FRAME_SIZE)
+31:	cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE)
+	cfi_rel_offset(%r12, FRAME_SIZE + 24)
+	cfi_rel_offset(%r13, FRAME_SIZE + 16)
+	cfi_rel_offset(%r14, FRAME_SIZE + 8)
+	cfi_rel_offset(%r15, FRAME_SIZE)
 #if cond_lock != 0
 	addq	$cond_lock, %rdi
 #endif
@@ -353,6 +399,8 @@ __pthread_cond_timedwait:
 
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 .Lreltmo:
+	xorl	%r15d, %r15d
+
 	/* Get internal lock.  */
 	movl	$1, %esi
 	xorl	%eax, %eax
@@ -716,9 +764,10 @@ __condvar_cleanup2:
 	callq	__pthread_mutex_cond_lock
 
 	movq	24(%rsp), %rdi
-	movq	FRAME_SIZE(%rsp), %r14
-	movq	FRAME_SIZE+8(%rsp), %r13
-	movq	FRAME_SIZE+16(%rsp), %r12
+	movq	FRAME_SIZE(%rsp), %r15
+	movq	FRAME_SIZE+8(%rsp), %r14
+	movq	FRAME_SIZE+16(%rsp), %r13
+	movq	FRAME_SIZE+24(%rsp), %r12
 .LcallUR:
 	call	_Unwind_Resume@PLT
 	hlt
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
index c3c879cde9..e6323ea3e2 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
@@ -22,6 +22,7 @@
 #include <lowlevellock.h>
 #include <lowlevelcond.h>
 #include <tcb-offsets.h>
+#include <pthread-pi-defines.h>
 #include <kernel-features.h>
 
@@ -47,6 +48,9 @@ __pthread_cond_wait:
 	pushq	%r12
 	cfi_adjust_cfa_offset(8)
 	cfi_rel_offset(%r12, 0)
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r13, 0)
 #define FRAME_SIZE 32
 	subq	$FRAME_SIZE, %rsp
 	cfi_adjust_cfa_offset(FRAME_SIZE)
@@ -124,24 +128,48 @@ __pthread_cond_wait:
 	movq	8(%rsp), %rdi
 	xorq	%r10, %r10
 	movq	%r12, %rdx
-	addq	$cond_futex-cond_lock, %rdi
+	// XXX reverse + lea
+	addq	$cond_futex, %rdi
 	cmpq	$-1, dep_mutex-cond_futex(%rdi)
 #ifdef __ASSUME_PRIVATE_FUTEX
 	movl	$FUTEX_WAIT, %eax
 	movl	$(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
 	cmove	%eax, %esi
 #else
-	movl	$FUTEX_WAIT, %eax
+	movl	$0, %eax
 	movl	%fs:PRIVATE_FUTEX, %esi
 	cmove	%eax, %esi
 # if FUTEX_WAIT != 0
+#  error "cc destroyed by following orl"
 	orl	$FUTEX_WAIT, %esi
 # endif
 #endif
+	je	60f
+
+	movq	dep_mutex-cond_futex(%rdi), %r8
+	/* Requeue to a PI mutex if the PI bit is set.  */
+	testl	$PI_BIT, MUTEX_KIND(%r8)
+	je	60f
+
+	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
+	movl	$SYS_futex, %eax
+	syscall
+
+	movl	$1, %r13d
+#ifdef __ASSUME_REQUEUE_PI
+	jmp	62f
+#else
+	cmpq	$-4095, %rax
+	jnae	62f
+
+	movl	$(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
+#endif
+
+60:	xorl	%r13d, %r13d
 	movl	$SYS_futex, %eax
 	syscall
 
-	movl	(%rsp), %edi
+62:	movl	(%rsp), %edi
 	callq	__pthread_disable_asynccancel
 .LcleanupEND:
@@ -209,11 +237,21 @@ __pthread_cond_wait:
 #endif
 	jne	10f
 
-11:	movq	16(%rsp), %rdi
+	/* If requeue_pi is used the kernel performs the locking of the
+	   mutex.  */
+11:	xorl	%eax, %eax
+	testl	%r13d, %r13d
+	jnz	14f
+
+	movq	16(%rsp), %rdi
 	callq	__pthread_mutex_cond_lock
 
-	addq	$FRAME_SIZE, %rsp
+14:	addq	$FRAME_SIZE, %rsp
 	cfi_adjust_cfa_offset(-FRAME_SIZE)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r13)
 	popq	%r12
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r12)
@@ -223,8 +261,9 @@ __pthread_cond_wait:
 
 	/* Initial locking failed.  */
 1:
-	cfi_adjust_cfa_offset(8 + FRAME_SIZE)
-	cfi_rel_offset(%r12, FRAME_SIZE)
+	cfi_adjust_cfa_offset(16 + FRAME_SIZE)
+	cfi_rel_offset(%r12, FRAME_SIZE + 8)
+	cfi_rel_offset(%r13, FRAME_SIZE)
 #if cond_lock != 0
 	addq	$cond_lock, %rdi
 #endif
@@ -308,9 +347,11 @@ versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
 __condvar_cleanup1:
 	/* Stack frame:
 
-	    rsp + 40
+	    rsp + 48
+		    +--------------------------+
+	    rsp + 40 | %r12                     |
 		    +--------------------------+
-	    rsp + 32 | %r12                     |
+	    rsp + 32 | %r13                     |
 		    +--------------------------+
 	    rsp + 24 | unused                   |
 		    +--------------------------+
@@ -431,7 +472,8 @@ __condvar_cleanup1:
 	callq	__pthread_mutex_cond_lock
 
 	movq	24(%rsp), %rdi
-	movq	32(%rsp), %r12
+	movq	40(%rsp), %r12
+	movq	32(%rsp), %r13
 .LcallUR:
 	call	_Unwind_Resume@PLT
 	hlt
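
The register juggling in this patch is easier to follow next to a rough C rendition of
the same futex protocol.  The sketch below is illustrative only: the helper names
(futex, cond_wait_pi, cond_wake_pi) are invented for this example, the condvar's
internal lock, waiter accounting, and timeout handling are all omitted, and the
argument order simply mirrors the register setup in the patched code
(%rdi, %rsi, %rdx, %r10, %r8, %r9d).  The operation values 11 and 12 match the
constants the patch adds to lowlevellock.h; older kernels reject them, which is what
the cmpq $-4095 / jb fallback paths above handle.

/* Hypothetical user-space sketch of the requeue-PI protocol; not the
   glibc implementation.  Needs a kernel with FUTEX_WAIT_REQUEUE_PI.  */
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef FUTEX_WAIT_REQUEUE_PI	/* Same values the patch defines.  */
# define FUTEX_WAIT_REQUEUE_PI	11
# define FUTEX_CMP_REQUEUE_PI	12
#endif

static long
futex (uint32_t *uaddr, int op, uint32_t val, const struct timespec *to,
       uint32_t *uaddr2, uint32_t val3)
{
  /* %rdi, %rsi, %rdx, %r10, %r8, %r9 in the assembly.  */
  return syscall (SYS_futex, uaddr, op, val, to, uaddr2, val3);
}

/* Waiter side.  SEQ is the condvar futex value read under the internal
   lock.  On success the kernel wakes us only after it has acquired
   PI_MUTEX on our behalf, so the caller must skip
   __pthread_mutex_cond_lock -- that is what the new %r13/%r15 flag and
   the jnz to labels 14/63 express.  */
static long
cond_wait_pi (uint32_t *cond_futex, uint32_t seq, uint32_t *pi_mutex)
{
  long err = futex (cond_futex, FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		    seq, NULL, pi_mutex, 0);
  if (err == -1)
    /* Any error, including ENOSYS on old kernels: fall back to a plain
       wait, like the !__ASSUME_REQUEUE_PI branch at label 60.  */
    err = futex (cond_futex, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, seq,
		 NULL, NULL, 0);
  return err;
}

/* Signal/broadcast side.  Wake one waiter and requeue up to NR_REQUEUE
   more onto the PI mutex: 0 for signal, 0x7fffffff for broadcast (the
   %r10d values above).  The expected value *COND_FUTEX is re-read
   racily here, mirroring the "movl (%rdi), %r9d" the XXX comment
   worries about; the kernel fails with EAGAIN if it no longer matches,
   and the assembly then retries with a plain FUTEX_WAKE.  */
static long
cond_wake_pi (uint32_t *cond_futex, uint32_t *pi_mutex, long nr_requeue)
{
  return futex (cond_futex, FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		1, (const struct timespec *) nr_requeue,
		pi_mutex, *cond_futex);
}

None of this is reachable unless the application asks for it: the PI bit tested
through MUTEX_KIND() is set only for PTHREAD_PRIO_INHERIT mutexes, and the requeue
path is skipped for process-shared condvars.  A program opts in roughly like this
(again a sketch; the variable names are arbitrary):

#include <pthread.h>

pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t mutex;

void
init_pi_mutex (void)
{
  pthread_mutexattr_t attr;
  pthread_mutexattr_init (&attr);
  /* Priority inheritance sets the kind's PI bit, so
     pthread_cond_wait (&cond, &mutex) takes the
     FUTEX_WAIT_REQUEUE_PI path added by this patch.  */
  pthread_mutexattr_setprotocol (&attr, PTHREAD_PRIO_INHERIT);
  pthread_mutex_init (&mutex, &attr);
  pthread_mutexattr_destroy (&attr);
}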