Skip to content

Commit

Permalink
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/mems…
Browse files Browse the repository at this point in the history
…et functions

They cause quite bad performance regressions on Netburst.
This is temporary until we can get new optimized functions
for these CPUs.

This undoes changes that were done in 2.6.15 and in 2.6.16-rc1,
essentially bringing the code back to the 2.6.14 level. The only change
is that I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD,
fixed the check for the flag, and also fixed some comments.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Andi Kleen authored and Linus Torvalds committed Feb 5, 2006
1 parent 6bca52b commit 7bcd3f3
Show file tree
Hide file tree
Showing 7 changed files with 543 additions and 24 deletions.
6 changes: 6 additions & 0 deletions arch/x86_64/kernel/setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
static int __init init_amd(struct cpuinfo_x86 *c)
{
int r;
unsigned level;

#ifdef CONFIG_SMP
unsigned long value;
Expand All @@ -899,6 +900,11 @@ static int __init init_amd(struct cpuinfo_x86 *c)
3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
clear_bit(0*32+31, &c->x86_capability);

/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);

r = get_model_name(c);
if (!r) {
switch (c->x86) {
Expand Down
38 changes: 38 additions & 0 deletions arch/x86_64/lib/clear_page.S
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,46 @@
/*
 * clear_page: store zero to the 4096 bytes starting at the address in %rdi.
 *
 * Each loop pass writes eight quadwords (64 bytes); %ecx counts the
 * 4096/64 passes. %rax, %rcx and %rdi are modified.
 */
.globl clear_page
.p2align 4
clear_page:
/* Writing %eax also zeroes the upper half of %rax, which is what gets stored. */
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
.Lloop:
/*
 * Decrement the pass counter first. None of the movq/leaq instructions
 * below modify the flags, so the jnz at the bottom still tests this result.
 */
decl %ecx
/* PUT(x) stores %rax into quadword number x of the current 64-byte chunk. */
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
/* Advance to the next chunk; leaq is used because it leaves the flags alone. */
leaq 64(%rdi),%rdi
jnz .Lloop
nop
ret
/* End marker: clear_page_end-clear_page gives the routine's length in bytes. */
clear_page_end:

/* Some CPUs run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

/*
 * One record in .altinstructions pairing clear_page with clear_page_c:
 * two 8-byte addresses followed by three single-byte fields.
 * NOTE(review): presumably consumed by the kernel's alternative-instruction
 * patching code, which is not part of this file -- confirm the field layout
 * against that code's record structure.
 */
.section .altinstructions,"a"
.align 8
/* Address of the default (unrolled) routine. */
.quad clear_page
/* Address of the string-instruction replacement. */
.quad clear_page_c
/* CPU feature bit associated with the replacement. */
.byte X86_FEATURE_REP_GOOD
/* Length in bytes of the default routine. */
.byte clear_page_end-clear_page
/* Length in bytes of the replacement. */
.byte clear_page_c_end-clear_page_c
.previous

/*
 * clear_page_c: string-instruction variant of clear_page.
 *
 * Loads %ecx with 4096/8 = 512 and zeroes %eax, then "rep stosq" stores
 * %rax to the memory at %rdi once per count, i.e. 512 quadwords = 4096 bytes.
 * Assumes the direction flag is clear so that %rdi advances upward --
 * TODO confirm against the callers' conventions.
 */
.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
/* End marker: clear_page_c_end-clear_page_c gives the replacement's length. */
clear_page_c_end:
.previous
87 changes: 87 additions & 0 deletions arch/x86_64/lib/copy_page.S
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,94 @@
/*
 * copy_page: copy 4096 bytes from the address in %rsi to the address in %rdi.
 *
 * The copy moves 64 bytes per pass through eight registers (%rax, %rbx,
 * %rdx, %r8-%r12). It is split into two loops: 59 passes that also prefetch
 * five 64-byte chunks ahead of the source pointer, then 5 passes without
 * prefetch. With that split the furthest prefetch address is
 * source + 58*64 + 5*64 = source + 4032, which stays inside the 4096
 * source bytes.
 */
.globl copy_page
.p2align 4
copy_page:
/*
 * Reserve three stack slots and save %rbx, %r12 and %r13.
 * %rbx and %r12 are used as data registers below; %r13 is saved and
 * restored but is not otherwise referenced in this routine.
 */
subq $3*8,%rsp
movq %rbx,(%rsp)
movq %r12,1*8(%rsp)
movq %r13,2*8(%rsp)

/* 64 chunks in total, minus the 5 handled by the second loop. */
movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
/*
 * Decrement the pass counter first (movl above zero-extended into %rcx).
 * The movq, prefetcht0 and leaq instructions below do not modify the
 * flags, so the jnz at the bottom still tests this result.
 */
dec %rcx

/* Load one 64-byte chunk from the source. */
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

/* Prefetch the source chunk five chunks ahead of the current one. */
prefetcht0 5*64(%rsi)

/* Store the chunk to the destination. */
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

/* Advance both pointers by one chunk without touching the flags. */
leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi

jnz .Loop64

/* Final 5 chunks: same copy as above, but with no prefetch. */
movl $5,%ecx
.p2align 4
.Loop2:
/* As in the first loop, the flags from this decrement feed the jnz below. */
decl %ecx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi

jnz .Loop2

/* Restore the saved registers and release the stack slots. */
movq (%rsp),%rbx
movq 1*8(%rsp),%r12
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret

/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

/*
 * One record in .altinstructions pairing copy_page with copy_page_c:
 * two 8-byte addresses followed by three single-byte fields.
 * NOTE(review): presumably consumed by the kernel's alternative-instruction
 * patching code, which is not part of this file -- confirm the field layout
 * against that code's record structure.
 */
.section .altinstructions,"a"
.align 8
/* Address of the default (unrolled) routine. */
.quad copy_page
/* Address of the string-instruction replacement. */
.quad copy_page_c
/* CPU feature bit associated with the replacement. */
.byte X86_FEATURE_REP_GOOD
/*
 * NOTE(review): both length bytes below are the replacement's length.
 * The clear_page record in clear_page.S instead puts the default
 * routine's own length in the first of the two. Possibly deliberate
 * (copy_page has no end label, and its length may not fit in one byte) --
 * confirm before changing.
 */
.byte copy_page_c_end-copy_page_c
.byte copy_page_c_end-copy_page_c
.previous

/*
 * copy_page_c: string-instruction variant of copy_page.
 *
 * Loads %ecx with 4096/8 = 512, then "rep movsq" copies that many quadwords
 * (4096 bytes) from the memory at %rsi to the memory at %rdi.
 * Assumes the direction flag is clear so that both pointers advance
 * upward -- TODO confirm against the callers' conventions.
 */
.section .altinstr_replacement,"ax"
copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
/* End marker: copy_page_c_end-copy_page_c gives the replacement's length. */
copy_page_c_end:
.previous
Loading

0 comments on commit 7bcd3f3

Please sign in to comment.