Skip to content

Commit

Permalink
Preserve SSE registers in runtime relocations on x86-64.
Browse files Browse the repository at this point in the history
SSE registers are used for passing parameters and must be preserved
in runtime relocations.  Inside ld.so this is enforced through the
tests in tst-xmmymm.sh.  But the malloc routines used after startup
come from libc.so and can be arbitrarily complex.  It's overkill
to save the SSE registers all the time because of that.  These calls
are rare.  Instead we save them on demand.  The new infrastructure
put in place in this patch makes this possible and efficient.
  • Loading branch information
Ulrich Drepper committed Jul 29, 2009
1 parent 9655389 commit b48a267
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 21 deletions.
15 changes: 15 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
2009-07-29 Ulrich Drepper <drepper@redhat.com>

* elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x
call that registers used in calling conventions need to be preserved.
* elf/dl-lookup.c (do_lookup_x): Use RTLD_*_FOREIGN_CALL macros
to preserve register content if necessary.
* sysdeps/x86_64/dl-trampoline.S (_dl_x86_64_save_sse): New function.
(_dl_x86_64_restore_sse): New function.
* sysdeps/x86_64/tst-xmmymm.sh: There is now one more function that
is allowed to modify xmm/ymm registers.

* stdio-common/scanf15.c: Undefine _LIBC. We want to test from an
application's perspective.
* stdio-common/scanf17.c: Likewise.

2009-07-28 Ulrich Drepper <drepper@redhat.com>

* csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB
Expand Down
13 changes: 13 additions & 0 deletions elf/dl-lookup.c
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
if (size * 3 <= tab->n_elements * 4)
{
/* Expand the table. */
#ifdef RTLD_CHECK_FOREIGN_CALL
/* This must not happen during runtime relocations. */
assert (!RTLD_CHECK_FOREIGN_CALL);
#endif
size_t newsize = _dl_higher_prime_number (size + 1);
struct unique_sym *newentries
= calloc (sizeof (struct unique_sym), newsize);
Expand All @@ -405,6 +409,11 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
}
else
{
#ifdef RTLD_CHECK_FOREIGN_CALL
/* This must not happen during runtime relocations. */
assert (!RTLD_CHECK_FOREIGN_CALL);
#endif

#define INITIAL_NUNIQUE_SYM_TABLE 31
size = INITIAL_NUNIQUE_SYM_TABLE;
entries = calloc (sizeof (struct unique_sym), size);
Expand Down Expand Up @@ -600,6 +609,10 @@ add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
unsigned int max
= undef_map->l_reldepsmax ? undef_map->l_reldepsmax * 2 : 10;

#ifdef RTLD_PREPARE_FOREIGN_CALL
RTLD_PREPARE_FOREIGN_CALL;
#endif

newp = malloc (sizeof (*newp) + max * sizeof (struct link_map *));
if (newp == NULL)
{
Expand Down
8 changes: 8 additions & 0 deletions elf/dl-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,21 @@ _dl_fixup (
flags |= DL_LOOKUP_GSCOPE_LOCK;
}

#ifdef RTLD_ENABLE_FOREIGN_CALL
RTLD_ENABLE_FOREIGN_CALL;
#endif

result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, l->l_scope,
version, ELF_RTYPE_CLASS_PLT, flags, NULL);

/* We are done with the global scope. */
if (!RTLD_SINGLE_THREAD_P)
THREAD_GSCOPE_RESET_FLAG ();

#ifdef RTLD_FINALIZE_FOREIGN_CALL
RTLD_FINALIZE_FOREIGN_CALL;
#endif

/* Currently result contains the base load address (or link map)
of the object that defines sym. Now add in the symbol
offset. */
Expand Down
8 changes: 8 additions & 0 deletions nptl/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
2009-07-29 Ulrich Drepper <drepper@redhat.com>

* sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the
dynamic linker might have to save. Define RTLD_CHECK_FOREIGN_CALL,
RTLD_ENABLE_FOREIGN_CALL, RTLD_PREPARE_FOREIGN_CALL, and
RTLD_FINALIZE_FOREIGN_CALL. Pretty printing.
* sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE.

2009-07-28 Ulrich Drepper <drepper@redhat.com>

* pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust):
Expand Down
1 change: 1 addition & 0 deletions nptl/sysdeps/x86_64/tcb-offsets.sym
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache)
#ifndef __ASSUME_PRIVATE_FUTEX
PRIVATE_FUTEX offsetof (tcbhead_t, private_futex)
#endif
RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse)
73 changes: 55 additions & 18 deletions nptl/sysdeps/x86_64/tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# include <sysdep.h>
# include <kernel-features.h>
# include <bits/wordsize.h>
# include <xmmintrin.h>


/* Type for the dtv. */
Expand All @@ -55,16 +56,23 @@ typedef struct
uintptr_t stack_guard;
uintptr_t pointer_guard;
unsigned long int vgetcpu_cache[2];
#ifndef __ASSUME_PRIVATE_FUTEX
# ifndef __ASSUME_PRIVATE_FUTEX
int private_futex;
#else
# else
int __unused1;
#endif
#if __WORDSIZE == 64
int __pad1;
#endif
# endif
# if __WORDSIZE == 64
int rtld_must_xmm_save;
# endif
/* Reservation of some values for the TM ABI. */
void *__private_tm[5];
# if __WORDSIZE == 64
long int __unused2;
/* Have space for the post-AVX register size. */
__m128 rtld_savespace_sse[8][4];

void *__padding[8];
# endif
} tcbhead_t;

#else /* __ASSEMBLER__ */
Expand Down Expand Up @@ -298,7 +306,7 @@ typedef struct


/* Atomic compare and exchange on TLS, returning old value. */
#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
({ __typeof (descr->member) __ret; \
__typeof (oldval) __old = (oldval); \
if (sizeof (descr->member) == 4) \
Expand All @@ -313,7 +321,7 @@ typedef struct


/* Atomic logical and. */
#define THREAD_ATOMIC_AND(descr, member, val) \
# define THREAD_ATOMIC_AND(descr, member, val) \
(void) ({ if (sizeof ((descr)->member) == 4) \
asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \
:: "i" (offsetof (struct pthread, member)), \
Expand All @@ -324,7 +332,7 @@ typedef struct


/* Atomic set bit. */
#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
(void) ({ if (sizeof ((descr)->member) == 4) \
asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \
:: "i" (offsetof (struct pthread, member)), \
Expand All @@ -334,7 +342,7 @@ typedef struct
abort (); })


#define CALL_THREAD_FCT(descr) \
# define CALL_THREAD_FCT(descr) \
({ void *__res; \
asm volatile ("movq %%fs:%P2, %%rdi\n\t" \
"callq *%%fs:%P1" \
Expand All @@ -355,18 +363,18 @@ typedef struct


/* Set the pointer guard field in the TCB head. */
#define THREAD_SET_POINTER_GUARD(value) \
# define THREAD_SET_POINTER_GUARD(value) \
THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value)
#define THREAD_COPY_POINTER_GUARD(descr) \
# define THREAD_COPY_POINTER_GUARD(descr) \
((descr)->header.pointer_guard \
= THREAD_GETMEM (THREAD_SELF, header.pointer_guard))


/* Get and set the global scope generation counter in the TCB head. */
#define THREAD_GSCOPE_FLAG_UNUSED 0
#define THREAD_GSCOPE_FLAG_USED 1
#define THREAD_GSCOPE_FLAG_WAIT 2
#define THREAD_GSCOPE_RESET_FLAG() \
# define THREAD_GSCOPE_FLAG_UNUSED 0
# define THREAD_GSCOPE_FLAG_USED 1
# define THREAD_GSCOPE_FLAG_WAIT 2
# define THREAD_GSCOPE_RESET_FLAG() \
do \
{ int __res; \
asm volatile ("xchgl %0, %%fs:%P1" \
Expand All @@ -377,11 +385,40 @@ typedef struct
lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \
} \
while (0)
#define THREAD_GSCOPE_SET_FLAG() \
# define THREAD_GSCOPE_SET_FLAG() \
THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
#define THREAD_GSCOPE_WAIT() \
# define THREAD_GSCOPE_WAIT() \
GL(dl_wait_lookup_done) ()


# ifdef SHARED
/* Assembly helpers (dl-trampoline.S) that copy the SSE/AVX parameter
   registers to and from the rtld_savespace_sse area of the TCB.  */
extern void _dl_x86_64_restore_sse (void);
extern void _dl_x86_64_save_sse (void);

/* The macros below cooperate through header.rtld_must_xmm_save:
     1  a runtime relocation is in progress and the vector registers
        have not been saved yet;
     0  either no runtime relocation is in progress, or the registers
        have already been saved and must be restored when finalizing.  */

/* Evaluates to nonzero while the "must save" flag is set.  */
# define RTLD_CHECK_FOREIGN_CALL \
  (0 != THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))

/* Raise the "must save" flag; used before the symbol lookup starts.  */
# define RTLD_ENABLE_FOREIGN_CALL \
  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)

/* Used right before calling code that may clobber vector registers:
   if the flag is raised, save the registers once and lower the flag.  */
# define RTLD_PREPARE_FOREIGN_CALL \
  do \
    { \
      if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0) \
	{ \
	  _dl_x86_64_save_sse (); \
	  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \
	} \
    } \
  while (0)

/* Used after the lookup: a lowered flag means the registers were saved
   in the meantime, so put them back.  The flag is lowered in any case.  */
# define RTLD_FINALIZE_FOREIGN_CALL \
  do \
    { \
      if (!THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \
	_dl_x86_64_restore_sse (); \
      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \
    } \
  while (0)
# endif


#endif /* __ASSEMBLER__ */

#endif /* tls.h */
1 change: 1 addition & 0 deletions stdio-common/scanf15.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#undef _GNU_SOURCE
#define _XOPEN_SOURCE 600
#undef _LIBC
/* The following macro definitions are a hack. They work around disabling
the GNU extension while still using a few internal headers. */
#define u_char unsigned char
Expand Down
1 change: 1 addition & 0 deletions stdio-common/scanf17.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#undef _GNU_SOURCE
#define _XOPEN_SOURCE 600
#undef _LIBC
/* The following macro definitions are a hack. They work around disabling
the GNU extension while still using a few internal headers. */
#define u_char unsigned char
Expand Down
82 changes: 82 additions & 0 deletions sysdeps/x86_64/dl-trampoline.S
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,85 @@ L(no_avx4):
cfi_endproc
.size _dl_runtime_profile, .-_dl_runtime_profile
#endif


#ifdef SHARED
/* void _dl_x86_64_save_sse (void)

   Store the first eight vector registers into the per-thread save area
   at %fs:RTLD_SAVESPACE_SSE.  With HAVE_AVX_SUPPORT the full %ymm0-%ymm7
   are stored when the CPU reports AVX, otherwise %xmm0-%xmm7.

   L(have_avx) caches the CPU probe: 0 = not probed yet, 1 = AVX
   available, -1 = AVX not available.

   The probe overwrites %eax, %ecx, %edx and %r11; %rbx is preserved
   across cpuid by parking it in %r11.

   NOTE(review): movdqa/vmovdqa need a 16-/32-byte aligned memory
   operand, so the save area must be aligned accordingly -- confirm
   the alignment of rtld_savespace_sse within tcbhead_t.  */
	.globl _dl_x86_64_save_sse
	.type _dl_x86_64_save_sse, @function
	.align 16
	cfi_startproc
_dl_x86_64_save_sse:
# ifdef HAVE_AVX_SUPPORT
	/* Skip the probe when the cached answer is already nonzero; the
	   flags of this compare are consumed by the js at label 1.  */
	cmpl	$0, L(have_avx)(%rip)
	jne	1f
	movq	%rbx, %r11		# Save rbx
	movl	$1, %eax
	cpuid
	movq	%r11,%rbx		# Restore rbx
	/* Turn bit 28 of %ecx (AVX) into +1 (present) or -1 (absent).  */
	movl	$1, %eax
	testl	$(1 << 28), %ecx
	jne	2f
	negl	%eax
2:	movl	%eax, L(have_avx)(%rip)
	cmpl	$0, %eax

1:	js	L(no_avx5)

#  define YMM_SIZE 32
	vmovdqa	%ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
	vmovdqa	%ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
	vmovdqa	%ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
	vmovdqa	%ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
	vmovdqa	%ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
	vmovdqa	%ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
	vmovdqa	%ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
	vmovdqa	%ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
	ret
L(no_avx5):
# endif
	/* The SSE slots are 16 bytes apart.  This must be XMM_SIZE, not a
	   second definition of YMM_SIZE: redefining YMM_SIZE as 16 would
	   leave XMM_SIZE undefined here and would also shrink the stride
	   of the AVX loads in _dl_x86_64_restore_sse.  */
# ifndef XMM_SIZE
#  define XMM_SIZE 16
# endif
	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
	ret
	cfi_endproc
	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse


/* void _dl_x86_64_restore_sse (void)

   Reload the first eight vector registers from the per-thread area at
   %fs:RTLD_SAVESPACE_SSE: %ymm0-%ymm7 on the AVX path, %xmm0-%xmm7
   otherwise.

   NOTE(review): YMM_SIZE and XMM_SIZE are not defined in this function;
   it relies on the #defines made in _dl_x86_64_save_sse above.  Verify
   that they evaluate to 32 and 16 respectively at this point.  */
.globl _dl_x86_64_restore_sse
.type _dl_x86_64_restore_sse, @function
.align 16
cfi_startproc
_dl_x86_64_restore_sse:
# ifdef HAVE_AVX_SUPPORT
/* A negative L(have_avx) selects the SSE-only path.  No CPU probe is
   done here; a value of 0 falls through to the AVX loads.
   NOTE(review): presumably _dl_x86_64_save_sse always ran first and
   set L(have_avx) -- confirm.  */
cmpl $0, L(have_avx)(%rip)
js L(no_avx6)

/* AVX path: slots are YMM_SIZE bytes apart.  */
vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
ret
L(no_avx6):
# endif
/* SSE path: slots are XMM_SIZE bytes apart.  */
movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
ret
cfi_endproc
.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
#endif
7 changes: 4 additions & 3 deletions sysdeps/x86_64/tst-xmmymm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,11 @@ for f in $tocheck; do
objdump -d "$objpfx"../*/"$f" |
awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
while read fct; do
if test "$fct" != "_dl_runtime_profile"; then
echo "function $fct in $f modifies xmm/ymm" >> "$tmp"
result=1
if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then
continue;
fi
echo "function $fct in $f modifies xmm/ymm" >> "$tmp"
result=1
done
done

Expand Down

0 comments on commit b48a267

Please sign in to comment.