From b0948ffdcbdace63317297d3d3fe2556387dfcbd Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 28 Jul 2009 09:40:39 -0700 Subject: [PATCH 1/8] Fix bookkeeping in mutex when using requeue_pi. --- nptl/ChangeLog | 17 +++++++++++++ nptl/pthreadP.h | 2 ++ nptl/pthread_mutex_lock.c | 21 +++++++++++++++- nptl/pthread_mutex_unlock.c | 7 +++--- .../unix/sysv/linux/pthread-pi-defines.sym | 1 + .../linux/x86_64/pthread_cond_broadcast.S | 6 +++-- .../sysv/linux/x86_64/pthread_cond_signal.S | 6 +++-- .../linux/x86_64/pthread_cond_timedwait.S | 24 ++++++++++++------- .../sysv/linux/x86_64/pthread_cond_wait.S | 23 +++++++++++------- 9 files changed, 83 insertions(+), 24 deletions(-) diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 8dd93732b5..8f37da7936 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,20 @@ +2009-07-28 Ulrich Drepper + + * pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust): + New function. + * pthreadP.h: Declare __pthread_mutex_cond_lock_adjust. + * sysdeps/unix/sysv/linux/pthread-pi-defines.sym: Add ROBUST_BIT. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S: Don't use + requeue_pi for robust mutexes. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S: Likewise. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise. + Don't only skip __pthread_mutex_cond_lock. Call instead + __pthread_mutex_cond_lock_adjust. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise. + + * pthread_mutex_unlock.c (__pthread_mutex_unlock_full): Minor + optimization of PI mutex handling. + 2009-07-27 Ulrich Drepper [BZ #10418] diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h index ed9fc625ba..43ca44c829 100644 --- a/nptl/pthreadP.h +++ b/nptl/pthreadP.h @@ -418,6 +418,8 @@ extern int __pthread_mutex_lock_internal (pthread_mutex_t *__mutex) attribute_hidden; extern int __pthread_mutex_cond_lock (pthread_mutex_t *__mutex) attribute_hidden internal_function; +extern void __pthread_mutex_cond_lock_adjust (pthread_mutex_t *__mutex) + attribute_hidden internal_function; extern int __pthread_mutex_unlock (pthread_mutex_t *__mutex); extern int __pthread_mutex_unlock_internal (pthread_mutex_t *__mutex) attribute_hidden; diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c index 406e588fdb..50dc18803d 100644 --- a/nptl/pthread_mutex_lock.c +++ b/nptl/pthread_mutex_lock.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2002. @@ -473,3 +473,22 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) strong_alias (__pthread_mutex_lock, pthread_mutex_lock) strong_alias (__pthread_mutex_lock, __pthread_mutex_lock_internal) #endif + + +#ifdef NO_INCR +void +__pthread_mutex_cond_lock_adjust (mutex) + pthread_mutex_t *mutex; +{ + assert ((mutex->__data.__kind & PTHREAD_MUTEX_PRIO_INHERIT_NP) != 0); + assert ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) == 0); + assert ((mutex->__data.__kind & PTHREAD_MUTEX_PSHARED_BIT) == 0); + + /* Record the ownership. 
*/ + pid_t id = THREAD_GETMEM (THREAD_SELF, tid); + mutex->__data.__owner = id; + + if (mutex->__data.__kind == PTHREAD_MUTEX_PI_RECURSIVE_NP) + ++mutex->__data.__count; +} +#endif diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c index fbe8274a55..f9fe10b0f2 100644 --- a/nptl/pthread_mutex_unlock.c +++ b/nptl/pthread_mutex_unlock.c @@ -150,7 +150,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) if (--mutex->__data.__count != 0) /* We still hold the mutex. */ return 0; - goto continue_pi; + goto continue_pi_non_robust; case PTHREAD_MUTEX_PI_ROBUST_RECURSIVE_NP: /* Recursive mutex. */ @@ -173,7 +173,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) /* We still hold the mutex. */ return 0; - goto continue_pi; + goto continue_pi_robust; case PTHREAD_MUTEX_PI_ERRORCHECK_NP: case PTHREAD_MUTEX_PI_NORMAL_NP: @@ -195,9 +195,9 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) pi_notrecoverable: newowner = PTHREAD_MUTEX_NOTRECOVERABLE; - continue_pi: if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0) { + continue_pi_robust: /* Remove mutex from the list. Note: robust PI futexes are signaled by setting bit 0. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, @@ -206,6 +206,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) DEQUEUE_MUTEX (mutex); } + continue_pi_non_robust: mutex->__data.__owner = newowner; if (decr) /* One less user. */ diff --git a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym index d985c6a79b..46fbd0de74 100644 --- a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym +++ b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym @@ -3,5 +3,6 @@ -- These PI macros are used by assembly code. MUTEX_KIND offsetof (pthread_mutex_t, __data.__kind) +ROBUST_BIT PTHREAD_MUTEX_ROBUST_NORMAL_NP PI_BIT PTHREAD_MUTEX_PRIO_INHERIT_NP PS_BIT PTHREAD_MUTEX_PSHARED_BIT diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S index 0f10ec910c..224a56088e 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S @@ -75,8 +75,10 @@ __pthread_cond_broadcast: jne 9f /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - jne 81f + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + je 81f /* Wake up all threads. */ #ifdef __ASSUME_PRIVATE_FUTEX diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S index f1050fea7c..4d001eec7f 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S @@ -64,8 +64,10 @@ __pthread_cond_signal: /* Get the address of the mutex used. 
*/ movq dep_mutex(%r8), %rcx - testl $PI_BIT, MUTEX_KIND(%rcx) - jne 9f + movl MUTEX_KIND(%rcx), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + je 9f #ifdef __ASSUME_PRIVATE_FUTEX movl $(FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG), %esi diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 7486825d5f..4913beb8af 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -165,9 +165,12 @@ __pthread_cond_timedwait: je 60f movq dep_mutex(%rdi), %r8 - /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - je 61f + /* Requeue to a non-robust PI mutex if the PI bit is set and + the robust bit is not set. */ + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + jne 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi xorl %eax, %eax @@ -289,11 +292,10 @@ __pthread_cond_timedwait: /* If requeue_pi is used the kernel performs the locking of the mutex. */ -41: xorl %eax, %eax +41: movq 16(%rsp), %rdi testl %r15d, %r15d - jnz 63f + jnz 64f - movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 63: testq %rax, %rax @@ -316,12 +318,18 @@ __pthread_cond_timedwait: retq - /* Initial locking failed. */ -31: cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE) + cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE) cfi_rel_offset(%r12, FRAME_SIZE + 24) cfi_rel_offset(%r13, FRAME_SIZE + 16) cfi_rel_offset(%r14, FRAME_SIZE + 8) cfi_rel_offset(%r15, FRAME_SIZE) + +64: callq __pthread_mutex_cond_lock_adjust + movq %r14, %rax + jmp 48b + + /* Initial locking failed. */ +31: #if cond_lock != 0 addq $cond_lock, %rdi #endif diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index 2fab38e277..a66523eab6 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -134,9 +134,12 @@ __pthread_cond_wait: je 60f movq dep_mutex-cond_futex(%rdi), %r8 - /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - je 61f + /* Requeue to a non-robust PI mutex if the PI bit is set and + the robust bit is not set. */ + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + jne 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi movl $SYS_futex, %eax @@ -234,11 +237,10 @@ __pthread_cond_wait: /* If requeue_pi is used the kernel performs the locking of the mutex. */ -11: xorl %eax, %eax +11: movq 16(%rsp), %rdi testl %r13d, %r13d - jnz 14f + jnz 18f - movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 14: addq $FRAME_SIZE, %rsp @@ -254,11 +256,16 @@ __pthread_cond_wait: /* We return the result of the mutex_lock operation. */ retq - /* Initial locking failed. */ -1: cfi_adjust_cfa_offset(16 + FRAME_SIZE) cfi_rel_offset(%r12, FRAME_SIZE + 8) cfi_rel_offset(%r13, FRAME_SIZE) + +18: callq __pthread_mutex_cond_lock_adjust + xorl %eax, %eax + jmp 14b + + /* Initial locking failed. */ +1: #if cond_lock != 0 addq $cond_lock, %rdi #endif From 9655389317c92e5935c47d90c0ba48ca54bd245e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 28 Jul 2009 21:58:32 -0700 Subject: [PATCH 2/8] Fix bookkeeping of static TLS block for TLS_TCB_AT_TP architectures. 
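On TLS_TCB_AT_TP architectures the TCB size must enter the static TLS
accounting exactly once: it is now added to GL(dl_tls_static_size) in
init_static_tls instead of being folded into memsz by __libc_setup_tls,
as the ChangeLog below describes.  The elf/dl-reloc.c hunk additionally
splits the free-space computation in two steps; a condensed sketch of
the size_t underflow this guards against (names as in dl-reloc.c):

  /* Before: one expression.  If dl_tls_static_size - dl_tls_static_used
     is smaller than TLS_TCB_SIZE, the unsigned subtraction wraps around
     to a huge value and the later "freebytes < blsize" check can never
     fire.  */
  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used) - TLS_TCB_SIZE;

  /* After: two steps, so broken bookkeeping is caught instead.  */
  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used);
  if (freebytes < TLS_TCB_SIZE)
    goto fail;
  freebytes -= TLS_TCB_SIZE;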
---
 ChangeLog      | 9 +++++++++
 csu/libc-tls.c | 9 +++++----
 elf/dl-reloc.c | 5 ++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 60b76547c1..ff34e5f5d5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2009-07-28  Ulrich Drepper
+
+	* csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB
+	size to memsz.
+	(init_static_tls) [TLS_TCB_AT_TP]: Add it to GL(dl_tls_static_size)
+	here.
+	* elf/dl-reloc.c (_dl_try_allocate_static_tls): Compute freebytes in
+	two steps to catch bugs.
+
 2009-07-27  Ulrich Drepper
 
 	* sysdeps/x86_64/tst-xmmymm.sh: Refine testing.  The script now
diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index 0d240ccef9..5a49942861 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -1,5 +1,5 @@
 /* Initialization code for TLS in statically linked application.
-   Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+   Copyright (C) 2002-2006, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -99,6 +99,9 @@ init_static_tls (size_t memsz, size_t align)
      surplus that permits dynamic loading of modules with IE-model TLS.  */
   GL(dl_tls_static_size) = roundup (memsz + GL(dl_tls_static_size),
 				    TLS_TCB_ALIGN);
+#if TLS_TCB_AT_TP
+  GL(dl_tls_static_size) += TLS_TCB_SIZE;
+#endif
   GL(dl_tls_static_used) = memsz;
   /* The alignment requirement for the static TLS block.  */
   GL(dl_tls_static_align) = align;
@@ -211,9 +214,7 @@ __libc_setup_tls (size_t tcbsize, size_t tcbalign)
 
   memsz = roundup (memsz, align ?: 1);
 
-#if TLS_TCB_AT_TP
-  memsz += tcbsize;
-#elif TLS_DTV_AT_TP
+#if TLS_DTV_AT_TP
   memsz += tcb_offset;
 #endif
 
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 28f08de3e7..680caadd65 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -61,7 +61,10 @@ _dl_try_allocate_static_tls (struct link_map *map)
   size_t n;
   size_t blsize;
 
-  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used) - TLS_TCB_SIZE;
+  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used);
+  if (freebytes < TLS_TCB_SIZE)
+    goto fail;
+  freebytes -= TLS_TCB_SIZE;
 
   blsize = map->l_tls_blocksize + map->l_tls_firstbyte_offset;
   if (freebytes < blsize)
From b48a267b8fbb885191a04cffdb4050a4d4c8a20b Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 29 Jul 2009 08:33:03 -0700
Subject: [PATCH 3/8] Preserve SSE registers in runtime relocations on x86-64.

SSE registers are used for passing parameters and must be preserved
in runtime relocations.  Inside ld.so this is enforced through the
tests in tst-xmmymm.sh.  But the malloc routines used after startup
come from libc.so and can be arbitrarily complex.  Since these calls
are rare, saving the SSE registers unconditionally would be overkill;
instead we save them on demand.  The new infrastructure put in place
in this patch makes this possible and efficient.
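In outline, the on-demand scheme works as follows (a condensed sketch
of the macros added to nptl/sysdeps/x86_64/tls.h and their use in
elf/dl-runtime.c and elf/dl-lookup.c below):

  /* _dl_fixup: the SSE argument registers are live, nothing saved yet.  */
  RTLD_ENABLE_FOREIGN_CALL;	/* sets header.rtld_must_xmm_save = 1 */

  result = _dl_lookup_symbol_x (...);
	/* ...deep inside the lookup, right before a malloc call:
	   RTLD_PREPARE_FOREIGN_CALL; if the flag is still set, this
	   spills xmm/ymm into rtld_savespace_sse in the TCB and clears
	   the flag.  If no allocation happens, nothing is ever saved.  */

  RTLD_FINALIZE_FOREIGN_CALL;	/* a cleared flag means a save happened:
				   restore the registers, reset the flag */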
--- ChangeLog | 15 ++++++ elf/dl-lookup.c | 13 +++++ elf/dl-runtime.c | 8 +++ nptl/ChangeLog | 8 +++ nptl/sysdeps/x86_64/tcb-offsets.sym | 1 + nptl/sysdeps/x86_64/tls.h | 73 ++++++++++++++++++------- stdio-common/scanf15.c | 1 + stdio-common/scanf17.c | 1 + sysdeps/x86_64/dl-trampoline.S | 82 +++++++++++++++++++++++++++++ sysdeps/x86_64/tst-xmmymm.sh | 7 +-- 10 files changed, 188 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index ff34e5f5d5..23e6906d06 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2009-07-29 Ulrich Drepper + + * elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x + call that registers used in calling conventions need to be preserved. + * elf/dl-lookup.c (do_lookup_x): Use RTLD_*_FOREIGN_CALL macros + to preserve register content if necessary. + * sysdeps/x86_64/dl-trampoline.S (_dl_x86_64_save_sse): New function. + (_dl_x86_64_restore_sse): New function. + * sysdeps/x86_64/tst-xmmymm.sh: There is now one more function that + is allowed to modify xmm/ymm registers. + + * stdio-common/scanf15.c: Undefine _LIBC. We want to test from an + application's perspective. + * stdio-common/scanf17.c: Likewise. + 2009-07-28 Ulrich Drepper * csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 1d68d67a35..56724c9b4d 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -380,6 +380,10 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, if (size * 3 <= tab->n_elements * 4) { /* Expand the table. */ +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. */ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif size_t newsize = _dl_higher_prime_number (size + 1); struct unique_sym *newentries = calloc (sizeof (struct unique_sym), newsize); @@ -405,6 +409,11 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, } else { +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. */ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif + #define INITIAL_NUNIQUE_SYM_TABLE 31 size = INITIAL_NUNIQUE_SYM_TABLE; entries = calloc (sizeof (struct unique_sym), size); @@ -600,6 +609,10 @@ add_dependency (struct link_map *undef_map, struct link_map *map, int flags) unsigned int max = undef_map->l_reldepsmax ? undef_map->l_reldepsmax * 2 : 10; +#ifdef RTLD_PREPARE_FOREIGN_CALL + RTLD_PREPARE_FOREIGN_CALL; +#endif + newp = malloc (sizeof (*newp) + max * sizeof (struct link_map *)); if (newp == NULL) { diff --git a/elf/dl-runtime.c b/elf/dl-runtime.c index 0eb7d4e3b9..a52120d121 100644 --- a/elf/dl-runtime.c +++ b/elf/dl-runtime.c @@ -111,6 +111,10 @@ _dl_fixup ( flags |= DL_LOOKUP_GSCOPE_LOCK; } +#ifdef RTLD_ENABLE_FOREIGN_CALL + RTLD_ENABLE_FOREIGN_CALL; +#endif + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, l->l_scope, version, ELF_RTYPE_CLASS_PLT, flags, NULL); @@ -118,6 +122,10 @@ _dl_fixup ( if (!RTLD_SINGLE_THREAD_P) THREAD_GSCOPE_RESET_FLAG (); +#ifdef RTLD_FINALIZE_FOREIGN_CALL + RTLD_FINALIZE_FOREIGN_CALL; +#endif + /* Currently result contains the base load address (or link map) of the object that defines sym. Now add in the symbol offset. */ diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 8f37da7936..24fd28a0dc 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,11 @@ +2009-07-29 Ulrich Drepper + + * sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the + dynamic linker might have to save. 
Define RTLD_CHECK_FOREIGN_CALL, + RTLD_ENABLE_FOREIGN_CALL, RTLD_PREPARE_FOREIGN_CALL, and + RTLD_FINALIZE_FOREIGN_CALL. Pretty printing. + * sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE. + 2009-07-28 Ulrich Drepper * pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust): diff --git a/nptl/sysdeps/x86_64/tcb-offsets.sym b/nptl/sysdeps/x86_64/tcb-offsets.sym index 1c70c6bde7..51f35c61cf 100644 --- a/nptl/sysdeps/x86_64/tcb-offsets.sym +++ b/nptl/sysdeps/x86_64/tcb-offsets.sym @@ -15,3 +15,4 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) #ifndef __ASSUME_PRIVATE_FUTEX PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) #endif +RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse) diff --git a/nptl/sysdeps/x86_64/tls.h b/nptl/sysdeps/x86_64/tls.h index ea89f3b1a2..a51b77052a 100644 --- a/nptl/sysdeps/x86_64/tls.h +++ b/nptl/sysdeps/x86_64/tls.h @@ -29,6 +29,7 @@ # include # include # include +# include /* Type for the dtv. */ @@ -55,16 +56,23 @@ typedef struct uintptr_t stack_guard; uintptr_t pointer_guard; unsigned long int vgetcpu_cache[2]; -#ifndef __ASSUME_PRIVATE_FUTEX +# ifndef __ASSUME_PRIVATE_FUTEX int private_futex; -#else +# else int __unused1; -#endif -#if __WORDSIZE == 64 - int __pad1; -#endif +# endif +# if __WORDSIZE == 64 + int rtld_must_xmm_save; +# endif /* Reservation of some values for the TM ABI. */ void *__private_tm[5]; +# if __WORDSIZE == 64 + long int __unused2; + /* Have space for the post-AVX register size. */ + __m128 rtld_savespace_sse[8][4]; + + void *__padding[8]; +# endif } tcbhead_t; #else /* __ASSEMBLER__ */ @@ -298,7 +306,7 @@ typedef struct /* Atomic compare and exchange on TLS, returning old value. */ -#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ +# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ ({ __typeof (descr->member) __ret; \ __typeof (oldval) __old = (oldval); \ if (sizeof (descr->member) == 4) \ @@ -313,7 +321,7 @@ typedef struct /* Atomic logical and. */ -#define THREAD_ATOMIC_AND(descr, member, val) \ +# define THREAD_ATOMIC_AND(descr, member, val) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -324,7 +332,7 @@ typedef struct /* Atomic set bit. */ -#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ +# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -334,7 +342,7 @@ typedef struct abort (); }) -#define CALL_THREAD_FCT(descr) \ +# define CALL_THREAD_FCT(descr) \ ({ void *__res; \ asm volatile ("movq %%fs:%P2, %%rdi\n\t" \ "callq *%%fs:%P1" \ @@ -355,18 +363,18 @@ typedef struct /* Set the pointer guard field in the TCB head. */ -#define THREAD_SET_POINTER_GUARD(value) \ +# define THREAD_SET_POINTER_GUARD(value) \ THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) -#define THREAD_COPY_POINTER_GUARD(descr) \ +# define THREAD_COPY_POINTER_GUARD(descr) \ ((descr)->header.pointer_guard \ = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) /* Get and set the global scope generation counter in the TCB head. 
*/ -#define THREAD_GSCOPE_FLAG_UNUSED 0 -#define THREAD_GSCOPE_FLAG_USED 1 -#define THREAD_GSCOPE_FLAG_WAIT 2 -#define THREAD_GSCOPE_RESET_FLAG() \ +# define THREAD_GSCOPE_FLAG_UNUSED 0 +# define THREAD_GSCOPE_FLAG_USED 1 +# define THREAD_GSCOPE_FLAG_WAIT 2 +# define THREAD_GSCOPE_RESET_FLAG() \ do \ { int __res; \ asm volatile ("xchgl %0, %%fs:%P1" \ @@ -377,11 +385,40 @@ typedef struct lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ } \ while (0) -#define THREAD_GSCOPE_SET_FLAG() \ +# define THREAD_GSCOPE_SET_FLAG() \ THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) -#define THREAD_GSCOPE_WAIT() \ +# define THREAD_GSCOPE_WAIT() \ GL(dl_wait_lookup_done) () + +# ifdef SHARED +/* Defined in dl-trampoline.S. */ +extern void _dl_x86_64_save_sse (void); +extern void _dl_x86_64_restore_sse (void); + +# define RTLD_CHECK_FOREIGN_CALL \ + (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0) + +# define RTLD_ENABLE_FOREIGN_CALL \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1) + +# define RTLD_PREPARE_FOREIGN_CALL \ + do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \ + { \ + _dl_x86_64_save_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } \ + while (0) + +# define RTLD_FINALIZE_FOREIGN_CALL \ + do { \ + if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0) \ + _dl_x86_64_restore_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } while (0) +# endif + + #endif /* __ASSEMBLER__ */ #endif /* tls.h */ diff --git a/stdio-common/scanf15.c b/stdio-common/scanf15.c index c56715c486..851466b3a9 100644 --- a/stdio-common/scanf15.c +++ b/stdio-common/scanf15.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. */ #define u_char unsigned char diff --git a/stdio-common/scanf17.c b/stdio-common/scanf17.c index ee9024f9b7..4478a7022f 100644 --- a/stdio-common/scanf17.c +++ b/stdio-common/scanf17.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. 
*/ #define u_char unsigned char diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 49d239f075..7ecf1b0c64 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -390,3 +390,85 @@ L(no_avx4): cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile #endif + + +#ifdef SHARED + .globl _dl_x86_64_save_sse + .type _dl_x86_64_save_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_save_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx5) + +# define YMM_SIZE 32 + vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE + vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE + vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE + vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE + vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE + vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE + vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE + vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE + ret +L(no_avx5): +# endif +# define YMM_SIZE 16 + movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE + movdqa %xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE + movdqa %xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE + movdqa %xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE + movdqa %xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE + movdqa %xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE + movdqa %xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE + movdqa %xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE + ret + cfi_endproc + .size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse + + + .globl _dl_x86_64_restore_sse + .type _dl_x86_64_restore_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_restore_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx6) + + vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0 + vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1 + vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2 + vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3 + vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4 + vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5 + vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6 + vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7 + ret +L(no_avx6): +# endif + movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0 + movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1 + movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2 + movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3 + movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4 + movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5 + movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6 + movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7 + ret + cfi_endproc + .size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse +#endif diff --git a/sysdeps/x86_64/tst-xmmymm.sh b/sysdeps/x86_64/tst-xmmymm.sh index a576e7da0d..da8af7e686 100755 --- a/sysdeps/x86_64/tst-xmmymm.sh +++ b/sysdeps/x86_64/tst-xmmymm.sh @@ -59,10 +59,11 @@ for f in $tocheck; do objdump -d "$objpfx"../*/"$f" | awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | while read fct; do - if test "$fct" != "_dl_runtime_profile"; then - echo "function $fct in $f modifies xmm/ymm" >> "$tmp" - result=1 + if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then + continue; fi + echo "function $fct in $f modifies xmm/ymm" >> "$tmp" + result=1 done done From 
09e0389eb12491d3e9ef74b299b66efdd67adb1c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 29 Jul 2009 08:40:54 -0700 Subject: [PATCH 4/8] Properly restore AVX registers on x86-64. tst-audit4 and tst-audit5 fail under AVX emulator due to je instead of jne. This patch fixes them. --- ChangeLog | 4 ++++ sysdeps/x86_64/dl-trampoline.S | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index 23e6906d06..856689f1c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2009-07-28 H.J. Lu + + * sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers. + 2009-07-29 Ulrich Drepper * elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 7ecf1b0c64..5a4c6ddecd 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -203,49 +203,49 @@ L(no_avx1): vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 L(no_avx2): @@ -361,13 +361,13 @@ L(no_avx3): vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 L(no_avx4): From 649bf1332071954cbae3e9159708aea1b7c9ae31 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 29 Jul 2009 08:50:03 -0700 Subject: [PATCH 5/8] Improve CFI in x86-64 ld.so trampoline code. --- ChangeLog | 5 +++++ sysdeps/x86_64/dl-trampoline.S | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 856689f1c0..081d59b28d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2009-07-29 Ulrich Drepper + + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI + information. + 2009-07-28 H.J. Lu * sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers. 
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 5a4c6ddecd..20da6956f1 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -61,6 +61,7 @@ _dl_runtime_resolve:
 
 	cfi_startproc
 _dl_runtime_profile:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
 	/* The La_x86_64_regs data structure pointed to by the
 	   fourth paramater must be 16-byte aligned.  This must
 	   be explicitly enforced.  We have the set up a dynamically
@@ -68,7 +69,7 @@ _dl_runtime_profile:
 	   has a fixed size and preserves the original stack pointer.  */
 
 	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(48) # Incorporate PLT
+	cfi_adjust_cfa_offset(32)
 	movq %rbx, (%rsp)
 	cfi_rel_offset(%rbx, 0)
 
From 586fa886ad1473759cddf897691fd3c63a6d2360 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 29 Jul 2009 09:01:04 -0700
Subject: [PATCH 6/8] Fix x86-64 TCB alignment for future processor versions.

---
 ChangeLog                 |  3 +--
 nptl/ChangeLog            | 10 +++++++---
 nptl/sysdeps/x86_64/tls.h |  7 ++++++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 081d59b28d..0273a595c4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,6 @@
 2009-07-29  Ulrich Drepper
 
-	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI
-	information.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI.
 
 2009-07-28  H.J. Lu
 
 	* sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers.
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index 24fd28a0dc..20031b5ae6 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,9 +1,13 @@
 2009-07-29  Ulrich Drepper
 
+	* sysdeps/x86_64/tls.h (TLS_TCB_ALIGN): Define explicitly to 32.
+
 	* sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the
-	dynamic linker might have to save.  Define RTLD_CHECK_FOREIGN_CALL,
-	RTLD_ENABLE_FOREIGN_CALL, RTLD_PREPARE_FOREIGN_CALL, and
-	RTLD_FINALIZE_FOREIGN_CALL.  Pretty printing.
+	dynamic linker might have to save.
+	Define RTLD_CHECK_FOREIGN_CALL, RTLD_ENABLE_FOREIGN_CALL,
+	RTLD_PREPARE_FOREIGN_CALL, and RTLD_FINALIZE_FOREIGN_CALL.  Pretty
+	printing.
+
 	* sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE.
 
 2009-07-28  Ulrich Drepper
diff --git a/nptl/sysdeps/x86_64/tls.h b/nptl/sysdeps/x86_64/tls.h
index a51b77052a..4212038ab5 100644
--- a/nptl/sysdeps/x86_64/tls.h
+++ b/nptl/sysdeps/x86_64/tls.h
@@ -117,7 +117,12 @@ typedef struct
 # define TLS_TCB_SIZE sizeof (struct pthread)
 
 /* Alignment requirements for the TCB.  */
-# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+//# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+// Normally the above would be correct.  But we have to store post-AVX
+// vector registers in the TCB and we want the storage to be aligned.
+// Unfortunately there isn't yet a type for these values and hence no
+// 32-byte alignment requirement.  Make this explicit, for now.
+# define TLS_TCB_ALIGN 32
 
 /* The TCB can have any size and the memory following the address the
    thread pointer points to is unspecified.  Allocate the TCB there.  */
From 9a1d2d455540ff99a586da5b550cc768f4f6fd5c Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 29 Jul 2009 15:22:28 -0700
Subject: [PATCH 7/8] Prepare use of IFUNC functions outside libc.so.

We use a callback function into libc.so to get access to the data
structure with the CPU feature information, and provide special
versions of the test macros which automatically use this function.
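For illustration, these pieces let an IFUNC be defined outside libc.so
like this (__foo is a hypothetical symbol; libm_ifunc and HAS_FMA are
the macros added in the diffs below):

  #include <init-arch.h>

  extern double __foo_generic (double);	/* hypothetical fallback */
  extern double __foo_fast (double);	/* hypothetical optimized version */

  /* The selector expression runs in the IFUNC resolver at relocation
     time.  Outside libc.so, HAS_FMA expands to a test that goes through
     __get_cpu_features (), which initializes the feature data on first
     use, so the calling DSO needs no prior setup.  */
  libm_ifunc (__foo, HAS_FMA ? __foo_fast : __foo_generic);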
--- include/libc-symbols.h | 13 ++++++++++++- sysdeps/x86_64/multiarch/init-arch.c | 10 ++++++++++ sysdeps/x86_64/multiarch/init-arch.h | 22 ++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/include/libc-symbols.h b/include/libc-symbols.h index 68da77c58e..252141eb01 100644 --- a/include/libc-symbols.h +++ b/include/libc-symbols.h @@ -1,6 +1,6 @@ /* Support macros for making weak and strong aliases for symbols, and for using symbol sets and linker warnings with GNU ld. - Copyright (C) 1995-1998, 2000-2006, 2008 Free Software Foundation, Inc. + Copyright (C) 1995-1998,2000-2006,2008,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -845,6 +845,17 @@ for linking") } \ __asm__ (".type " #name ", %gnu_indirect_function"); +/* The body of the function is supposed to use __get_cpu_features + which will, if necessary, initialize the data first. */ +#define libm_ifunc(name, expr) \ + extern void *name##_ifunc (void) __asm__ (#name); \ + void *name##_ifunc (void) \ + { \ + __typeof (name) *res = expr; \ + return res; \ + } \ + __asm__ (".type " #name ", %gnu_indirect_function"); + #ifdef HAVE_ASM_SET_DIRECTIVE # define libc_ifunc_hidden_def1(local, name) \ __asm__ (declare_symbol_alias_1_stringify (ASM_GLOBAL_DIRECTIVE) \ diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 35fd19af0e..49b421eac8 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -86,3 +86,13 @@ __init_cpu_features (void) else __cpu_features.kind = arch_kind_other; } + + +const struct cpu_features * +__get_cpu_features (void) +{ + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); + + return &__cpu_features; +} diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 48a2127418..0151e8b95b 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -54,10 +54,28 @@ extern void __init_cpu_features (void) attribute_hidden; __init_cpu_features (); \ while (0) +/* Used from outside libc.so to get access to the CPU features structure. */ +extern const struct cpu_features *__get_cpu_features (void) + __attribute__ ((const)); + /* Following are the feature tests used throughout libc. */ -#define HAS_POPCOUNT \ +#ifndef NOT_IN_libc +# define HAS_POPCOUNT \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) -#define HAS_SSE4_2 \ +# define HAS_SSE4_2 \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#else +# define HAS_POPCOUNT \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) + +# define HAS_SSE4_2 \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#endif From 78c4ef475d47a2289635f74b726f52defedb4651 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 29 Jul 2009 15:26:06 -0700 Subject: [PATCH 8/8] Add support for x86-64 fma instruction. Use it to implement fma and fmaf, if possible. 
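The point of using the hardware instruction rather than the generic
code: the fallback in math/s_fma.c computes (x * y) + z with two
roundings, so it is not a correctly rounded fma, while vfmadd213sd
rounds exactly once.  A small worked example of the difference (all
values exactly representable in binary64, round-to-nearest assumed):

  double x = 1.0 + 0x1p-52;	/* 1 + one ulp */
  double z = -(1.0 + 0x1p-51);

  /* Exactly, x*x = 1 + 2^-51 + 2^-104.  Rounding the product first
     drops the 2^-104 term, so the two-rounding form yields 0.0.  */
  double r1 = (x * x) + z;	/* 0x0p+0 */

  /* A true fma keeps the full product; the result is exact.  */
  double r2 = fma (x, x, z);	/* 0x1p-104 */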
--- ChangeLog | 14 ++++++++++ math/s_fma.c | 4 ++- math/s_fmaf.c | 4 ++- sysdeps/x86_64/multiarch/Versions | 5 ++++ sysdeps/x86_64/multiarch/s_fma.c | 43 +++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/s_fmaf.c | 42 ++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/Versions create mode 100644 sysdeps/x86_64/multiarch/s_fma.c create mode 100644 sysdeps/x86_64/multiarch/s_fmaf.c diff --git a/ChangeLog b/ChangeLog index 0273a595c4..0d0120ccb6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,19 @@ 2009-07-29 Ulrich Drepper + * math/s_fma.c: Don't define alias if __fma is a macro. + * math/s_fmaf.c: Likewise. + * sysdeps/x86_64/multiarch/s_fma.c: New file. + * sysdeps/x86_64/multiarch/s_fmaf.c: New file. + Partially based on a patch by H.J. Lu . + + * sysdeps/x86_64/multiarch/init-arch.h (__get_cpu_features): Declare. + (HAS_POPCOUNT, HAS_SSE4_2): Add variants which work outside libc. + New macro HAS_FMA. + * sysdeps/x86_64/multiarch/init-arch.c (__get_cpu_features): New + function. + * include/libc-symbols.h (libm_ifunc): Define. + * sysdeps/x86_64/multiarch/Versions: New file. + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI. 2009-07-28 H.J. Lu diff --git a/math/s_fma.c b/math/s_fma.c index e5ff5a7228..476d1fe44c 100644 --- a/math/s_fma.c +++ b/math/s_fma.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. - Copyright (C) 1997, 2001 Free Software Foundation, Inc. + Copyright (C) 1997, 2001, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -25,7 +25,9 @@ __fma (double x, double y, double z) { return (x * y) + z; } +#ifndef __fma weak_alias (__fma, fma) +#endif #ifdef NO_LONG_DOUBLE strong_alias (__fma, __fmal) diff --git a/math/s_fmaf.c b/math/s_fmaf.c index caa7f3afe8..357296d70d 100644 --- a/math/s_fmaf.c +++ b/math/s_fmaf.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. - Copyright (C) 1997 Free Software Foundation, Inc. + Copyright (C) 1997, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -25,4 +25,6 @@ __fmaf (float x, float y, float z) { return (x * y) + z; } +#ifndef __fmaf weak_alias (__fmaf, fmaf) +#endif diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions new file mode 100644 index 0000000000..59b185ac8d --- /dev/null +++ b/sysdeps/x86_64/multiarch/Versions @@ -0,0 +1,5 @@ +libc { + GLIBC_PRIVATE { + __get_cpu_features; + } +} diff --git a/sysdeps/x86_64/multiarch/s_fma.c b/sysdeps/x86_64/multiarch/s_fma.c new file mode 100644 index 0000000000..40601e9a68 --- /dev/null +++ b/sysdeps/x86_64/multiarch/s_fma.c @@ -0,0 +1,43 @@ +/* FMA version of fma. + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+#include <math.h>
+#include <init-arch.h>
+
+#ifdef HAVE_AVX_SUPPORT
+
+extern double __fma_sse2 (double x, double y, double z);
+
+
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_sse2);
+weak_alias (__fma, fma)
+
+# define __fma __fma_sse2
+#endif
+
+#include <math/s_fma.c>
diff --git a/sysdeps/x86_64/multiarch/s_fmaf.c b/sysdeps/x86_64/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..f3d37f8f4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/s_fmaf.c
@@ -0,0 +1,42 @@
+/* FMA version of fmaf.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+#include <math.h>
+#include <init-arch.h>
+
+#ifdef HAVE_AVX_SUPPORT
+
+extern float __fmaf_sse2 (float x, float y, float z);
+
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_sse2);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_sse2
+#endif
+
+#include <math/s_fmaf.c>