Enable SSE2 memset for AMD's upcoming Orochi processor.
This patch enables SSE2 memset for AMD's upcoming Orochi processor.
This patch also fixes the following bug:
For misaligned blocks larger than 144 bytes, when multiarch is enabled,
memset branches into the integer code path depending on the value of the
misalignment, even though the startup code chose the SSE2 code path
upfront.
Harsha Jagasia authored and Ulrich Drepper committed Mar 5, 2011
1 parent 13a804d commit 7e4ba49
Showing 4 changed files with 94 additions and 51 deletions.
16 changes: 15 additions & 1 deletion ChangeLog
@@ -1,3 +1,17 @@
+2011-03-02  Harsha Jagasia  <harsha.jagasia@amd.com>
+	    Ulrich Drepper  <drepper@gmail.com>
+
+	* sysdeps/x86_64/memset.S: After aligning destination, code
+	branches to different locations depending on the value of
+	misalignment, when multiarch is enabled.  Fix this.
+
+2011-03-02  Harsha Jagasia  <harsha.jagasia@amd.com>
+
+	* sysdeps/x86_64/cacheinfo.c (init_cacheinfo):
+	Set __x86_64_preferred_memory_instruction for AMD processors.
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set bit_Prefer_SSE_for_memop for AMD processors.
+
2011-03-04 Ulrich Drepper <drepper@gmail.com>

* libio/fmemopen.c (fmemopen): Optimize a bit.
@@ -12,7 +26,7 @@

2011-02-28 Aurelien Jarno <aurelien@aurel32.net>

-	* sysdeps/sparc/sparc64/multiarch/memset.S(__bzero): call
+	* sysdeps/sparc/sparc64/multiarch/memset.S(__bzero): Call
__bzero_ultra1 instead of __memset_ultra1.

2011-02-23 Andreas Schwab <schwab@redhat.com>
49 changes: 34 additions & 15 deletions sysdeps/x86_64/cacheinfo.c
@@ -1,5 +1,5 @@
/* x86_64 cache info.
-   Copyright (C) 2003, 2004, 2006, 2007, 2009 Free Software Foundation, Inc.
+   Copyright (C) 2003,2004,2006,2007,2009,2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -352,11 +352,11 @@ handle_amd (int name)

case _SC_LEVEL2_CACHE_ASSOC:
switch ((ecx >> 12) & 0xf)
-{
-case 0:
-case 1:
-case 2:
-case 4:
+{
+case 0:
+case 1:
+case 2:
+case 4:
return (ecx >> 12) & 0xf;
case 6:
return 8;
@@ -376,7 +376,7 @@ handle_amd (int name)
return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
default:
return 0;
-}
+}
/* NOTREACHED */

case _SC_LEVEL2_CACHE_LINESIZE:
@@ -521,10 +521,10 @@ init_cacheinfo (void)
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);

if (shared <= 0)
-{
+{
/* Try L2 otherwise. */
-level = 2;
-shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+level = 2;
+shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
}

unsigned int ebx_1;
@@ -540,7 +540,7 @@ init_cacheinfo (void)

#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
/* Intel prefers SSSE3 instructions for memory/string routines
-   if they are avaiable. */
+   if they are available. */
if ((ecx & 0x200))
__x86_64_preferred_memory_instruction = 3;
else
@@ -550,7 +550,7 @@ init_cacheinfo (void)
/* Figure out the number of logical threads that share the
highest cache level. */
if (max_cpuid >= 4)
-{
+{
int i = 0;

/* Query until desired cache level is enumerated. */
@@ -565,7 +565,7 @@ init_cacheinfo (void)
if ((eax & 0x1f) == 0)
goto intel_bug_no_cache_info;
}
-while (((eax >> 5) & 0x7) != level);
+while (((eax >> 5) & 0x7) != level);

threads = (eax >> 14) & 0x3ff;

@@ -602,7 +602,7 @@ init_cacheinfo (void)
threads += 1;
}
else
-{
+{
intel_bug_no_cache_info:
/* Assume that all logical threads share the highest cache level. */

@@ -612,7 +612,7 @@ init_cacheinfo (void)
/* Cap usage of highest cache level to the number of supported
threads. */
if (shared > 0 && threads > 0)
-shared /= threads;
+shared /= threads;
}
/* This spells out "AuthenticAMD". */
else if (is_amd)
@@ -621,6 +621,25 @@ init_cacheinfo (void)
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

+#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
+# ifdef USE_MULTIARCH
+eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
+ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
+# else
+__cpuid (1, eax, ebx, ecx, edx);
+# endif
+
+/* AMD prefers SSSE3 instructions for memory/string routines
+   if they are available, otherwise it prefers integer
+   instructions. */
+if ((ecx & 0x200))
+__x86_64_preferred_memory_instruction = 3;
+else
+__x86_64_preferred_memory_instruction = 0;
+#endif
+
/* Get maximum extended function. */
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

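The ecx & 0x200 test added above reads CPUID leaf 1's ECX register, whose bit 9 is the SSSE3 feature flag. A standalone sketch of the same probe using GCC's <cpuid.h> (illustrative; glibc itself reads the cached __cpu_features copy when multiarch is enabled):

    #include <cpuid.h>
    #include <stdio.h>

    int main (void)
    {
      unsigned int eax, ebx, ecx, edx;

      /* Leaf 1 holds the feature flags; __get_cpuid returns 0 if the
         leaf is not supported.  */
      if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
        return 1;

      /* Bit 9 of ECX (0x200) is SSSE3, the bit this patch tests.  */
      printf ("SSSE3 %savailable\n", (ecx & 0x200) ? "" : "not ");
      return 0;
    }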
68 changes: 35 additions & 33 deletions sysdeps/x86_64/memset.S
@@ -1,6 +1,6 @@
/* memset/bzero -- set memory area to CH/0
Optimized version for x86-64.
-   Copyright (C) 2002-2005, 2007, 2008 Free Software Foundation, Inc.
+   Copyright (C) 2002-2005, 2007, 2008, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
#define __STOS_LOWER_BOUNDARY $8192
#define __STOS_UPPER_BOUNDARY $65536

-.text
+.text
#if !defined NOT_IN_libc && !defined USE_MULTIARCH
ENTRY(__bzero)
mov %rsi,%rdx /* Adjust parameter. */
@@ -417,7 +417,7 @@ L(P4Q0): mov %edx,-0x4(%rdi)
retq

.balign 16
-#if defined(USE_EXTRA_TABLE)
+#ifdef USE_EXTRA_TABLE
L(P5QI): mov %rdx,-0x95(%rdi)
#endif
L(P5QH): mov %rdx,-0x8d(%rdi)
@@ -596,6 +596,8 @@ L(A6Q0): mov %dx,-0x6(%rdi)
jmp L(aligned_now)

L(SSE_pre):
+#else
+L(aligned_now):
#endif
#if !defined USE_MULTIARCH || defined USE_SSE2
# fill RegXMM0 with the pattern
@@ -606,16 +608,16 @@ L(SSE_pre):
jge L(byte32sse2_pre)

add %r8,%rdi
-#ifndef PIC
+# ifndef PIC
lea L(SSExDx)(%rip),%r9
jmpq *(%r9,%r8,8)
-#else
+# else
lea L(SSE0Q0)(%rip),%r9
lea L(SSExDx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
lea (%rcx,%r9,1),%r9
jmpq *%r9
-#endif
+# endif

L(SSE0QB): movdqa %xmm0,-0xb0(%rdi)
L(SSE0QA): movdqa %xmm0,-0xa0(%rdi)
@@ -881,16 +883,16 @@ L(byte32sse2):
lea 0x80(%rdi),%rdi
jge L(byte32sse2)
add %r8,%rdi
-#ifndef PIC
+# ifndef PIC
lea L(SSExDx)(%rip),%r11
jmpq *(%r11,%r8,8)
-#else
+# else
lea L(SSE0Q0)(%rip),%r11
lea L(SSExDx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
lea (%rcx,%r11,1),%r11
jmpq *%r11
-#endif
+# endif

.balign 16
L(sse2_nt_move_pre):
Expand All @@ -916,20 +918,20 @@ L(sse2_nt_move):
jge L(sse2_nt_move)
sfence
add %r8,%rdi
-#ifndef PIC
+# ifndef PIC
lea L(SSExDx)(%rip),%r11
jmpq *(%r11,%r8,8)
-#else
+# else
lea L(SSE0Q0)(%rip),%r11
lea L(SSExDx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
lea (%rcx,%r11,1),%r11
jmpq *%r11
-#endif
+# endif

.pushsection .rodata
.balign 16
-#ifndef PIC
+# ifndef PIC
L(SSExDx):
.quad L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
.quad L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
@@ -979,7 +981,7 @@ L(SSExDx):
.quad L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
.quad L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
.quad L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
-#else
+# else
L(SSExDx):
.short L(SSE0Q0) -L(SSE0Q0)
.short L(SSE1Q0) -L(SSE0Q0)
@@ -1196,14 +1198,14 @@ L(SSExDx):
.short L(SSE13QB)-L(SSE0Q0)
.short L(SSE14QB)-L(SSE0Q0)
.short L(SSE15QB)-L(SSE0Q0)
-#endif
+# endif
.popsection
#endif /* !defined USE_MULTIARCH || defined USE_SSE2 */

.balign 16
+#ifndef USE_MULTIARCH
L(aligned_now):

-#ifndef USE_MULTIARCH
cmpl $0x1,__x86_64_preferred_memory_instruction(%rip)
jg L(SSE_pre)
#endif /* USE_MULTIARCH */
@@ -1246,17 +1248,17 @@ L(8byte_move_loop):

L(8byte_move_skip):
andl $127,%r8d
-lea (%rdi,%r8,1),%rdi
+lea (%rdi,%r8,1),%rdi

#ifndef PIC
-lea L(setPxQx)(%rip),%r11
-jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+lea L(setPxQx)(%rip),%r11
+jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
#else
-lea L(Got0)(%rip),%r11
+lea L(Got0)(%rip),%r11
lea L(setPxQx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
-lea (%rcx,%r11,1),%r11
-jmpq *%r11
+lea (%rcx,%r11,1),%r11
+jmpq *%r11
#endif

.balign 16
Expand Down Expand Up @@ -1290,16 +1292,16 @@ L(8byte_stos_skip):
ja L(8byte_nt_move)

andl $7,%r8d
-lea (%rdi,%r8,1),%rdi
+lea (%rdi,%r8,1),%rdi
#ifndef PIC
-lea L(setPxQx)(%rip),%r11
-jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+lea L(setPxQx)(%rip),%r11
+jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
#else
-lea L(Got0)(%rip),%r11
+lea L(Got0)(%rip),%r11
lea L(setPxQx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
lea (%rcx,%r11,1),%r11
-jmpq *%r11
+jmpq *%r11
#endif

.balign 16
@@ -1338,16 +1340,16 @@ L(8byte_nt_move_loop):
L(8byte_nt_move_skip):
andl $127,%r8d

-lea (%rdi,%r8,1),%rdi
+lea (%rdi,%r8,1),%rdi
#ifndef PIC
-lea L(setPxQx)(%rip),%r11
-jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
+lea L(setPxQx)(%rip),%r11
+jmpq *(%r11,%r8,8) # old scheme remained for nonPIC
#else
-lea L(Got0)(%rip),%r11
+lea L(Got0)(%rip),%r11
lea L(setPxQx)(%rip),%rcx
movswq (%rcx,%r8,2),%rcx
-lea (%rcx,%r11,1),%r11
-jmpq *%r11
+lea (%rcx,%r11,1),%r11
+jmpq *%r11
#endif

END (memset)
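A note on the PIC/non-PIC pairs throughout this file: non-PIC builds index a table of absolute addresses (.quad L(SSEnQx)) directly, while PIC builds store 16-bit offsets (.short L(SSEnQx)-L(SSE0Q0)) and add them to a base label at run time, so the table needs no load-time relocations. GCC's computed-goto extension expresses the same offset-table technique in C (a sketch of the idea, not glibc code):

    #include <stdio.h>

    /* Offset-based jump table: store label differences relative to a base
       label and jump to base + offset -- the C analogue of the
       movswq/lea/jmpq sequence used for the PIC L(SSExDx) table.  */
    static int dispatch (int i)
    {
      static const int offs[] = { &&t0 - &&t0, &&t1 - &&t0, &&t2 - &&t0 };
      goto *(&&t0 + offs[i]);
     t0: return 0;
     t1: return 1;
     t2: return 2;
    }

    int main (void)
    {
      printf ("%d %d %d\n", dispatch (0), dispatch (1), dispatch (2));
      return 0;
    }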
12 changes: 10 additions & 2 deletions sysdeps/x86_64/multiarch/init-arch.c
@@ -1,6 +1,6 @@
/* Initialize CPU feature data.
This file is part of the GNU C Library.
-   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@redhat.com>.
The GNU C Library is free software; you can redistribute it and/or
@@ -60,7 +60,7 @@ __init_cpu_features (void)
get_common_indeces (&family, &model);

/* Intel processors prefer SSE instruction for memory/string
-   routines if they are avaiable. */
+   routines if they are available. */
__cpu_features.feature[index_Prefer_SSE_for_memop]
|= bit_Prefer_SSE_for_memop;

@@ -107,6 +107,14 @@ __init_cpu_features (void)
kind = arch_kind_amd;

get_common_indeces (&family, &model);

+unsigned int ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+
+/* AMD processors prefer SSE instructions for memory/string routines
+   if they are available, otherwise they prefer integer instructions. */
+if ((ecx & 0x200))
+__cpu_features.feature[index_Prefer_SSE_for_memop]
+|= bit_Prefer_SSE_for_memop;
}
else
kind = arch_kind_other;
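The feature word updated above is a plain bitmask: index_Prefer_SSE_for_memop selects a word in the feature array and bit_Prefer_SSE_for_memop a bit within it, so the AMD branch simply ORs the bit in when CPUID reports SSSE3. The same pattern in miniature (illustrative index and bit values, not glibc's real ones):

    #include <stdio.h>

    #define index_Prefer_SSE_for_memop 0        /* illustrative word index */
    #define bit_Prefer_SSE_for_memop  (1 << 3)  /* illustrative bit position */

    static unsigned int feature[1];             /* miniature feature array */

    int main (void)
    {
      unsigned int ecx = 0x200;  /* pretend CPUID.1:ECX reported SSSE3 */

      if ((ecx & 0x200))
        feature[index_Prefer_SSE_for_memop] |= bit_Prefer_SSE_for_memop;

      printf ("Prefer_SSE_for_memop: %d\n",
              !!(feature[index_Prefer_SSE_for_memop] & bit_Prefer_SSE_for_memop));
      return 0;
    }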
