Skip to content

Commit

Permalink
---
Browse files Browse the repository at this point in the history
yaml
---
r: 19768
b: refs/heads/master
c: 7bcd3f3
h: refs/heads/master
v: v3
  • Loading branch information
Andi Kleen authored and Linus Torvalds committed Feb 5, 2006
1 parent 59a3d33 commit b82ff47
Show file tree
Hide file tree
Showing 8 changed files with 544 additions and 25 deletions.
2 changes: 1 addition & 1 deletion [refs]
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
---
refs/heads/master: 6bca52b544489b626c7d0db801df6b4aa3d5adb5
refs/heads/master: 7bcd3f34e262bbebffa954d80eab3a84f053da31
6 changes: 6 additions & 0 deletions trunk/arch/x86_64/kernel/setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
static int __init init_amd(struct cpuinfo_x86 *c)
{
int r;
unsigned level;

#ifdef CONFIG_SMP
unsigned long value;
Expand All @@ -899,6 +900,11 @@ static int __init init_amd(struct cpuinfo_x86 *c)
3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
clear_bit(0*32+31, &c->x86_capability);

/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);

r = get_model_name(c);
if (!r) {
switch (c->x86) {
Expand Down
38 changes: 38 additions & 0 deletions trunk/arch/x86_64/lib/clear_page.S
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,46 @@
/*
 * clear_page - zero a 4096-byte page at %rdi.
 *
 * Unrolled store loop: each iteration writes one 64-byte cache line
 * as eight quadword stores of zero, 4096/64 = 64 iterations total.
 * Clobbers %eax, %ecx, and advances %rdi past the page.
 */
.globl clear_page
.p2align 4
clear_page:
xorl %eax,%eax              /* %rax = 0, the value stored everywhere below */
movl $4096/64,%ecx          /* 64 iterations x 64 bytes = 4096 */
.p2align 4
.Lloop:
decl %ecx                   /* sets ZF on the last iteration, tested by jnz */
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
leaq 64(%rdi),%rdi          /* advance one cache line; lea leaves flags intact */
jnz .Lloop
nop
ret
clear_page_end:             /* marks the end so the patch length can be computed */

/* Some CPUs run faster using the string instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

/*
 * Alternatives-patching record: on CPUs with X86_FEATURE_REP_GOOD
 * (set in setup.c for C-stepping and later K8), the kernel patches
 * the unrolled clear_page above with the rep-stosq variant below
 * at boot.  Field order here presumably matches struct alt_instr:
 * orig addr, replacement addr, cpuid feature bit, orig length,
 * replacement length -- TODO(review): confirm against
 * include/asm-x86_64/alternative handling in this tree.
 */
.section .altinstructions,"a"
.align 8
.quad clear_page
.quad clear_page_c
.byte X86_FEATURE_REP_GOOD
.byte clear_page_end-clear_page
.byte clear_page_c_end-clear_page_c
.previous

/* Replacement body: zero the page with a single rep stosq. */
.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx           /* 512 quadwords */
xorl %eax,%eax
rep
stosq
ret
clear_page_c_end:
.previous
87 changes: 87 additions & 0 deletions trunk/arch/x86_64/lib/copy_page.S
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,94 @@
/*
 * copy_page - copy a 4096-byte page from src %rsi to dst %rdi.
 *
 * Unrolled copy, one 64-byte cache line per iteration.  The first
 * (4096/64)-5 iterations (.Loop64) prefetch 5 lines ahead; the final
 * 5 iterations (.Loop2) run without prefetching so we never prefetch
 * past the end of the source page.
 *
 * %rbx and %r12 are callee-saved in the System V AMD64 ABI and are
 * clobbered as scratch below, so they are preserved on the stack.
 * (Fix: the previous version also saved and restored %r13, which is
 * never touched by this routine -- that dead save/restore and its
 * stack slot are removed here.)
 */
.globl copy_page
.p2align 4
copy_page:
subq $2*8,%rsp
movq %rbx,(%rsp)
movq %r12,1*8(%rsp)

movl $(4096/64)-5,%ecx      /* all but the last 5 cache lines */
.p2align 4
.Loop64:
dec %rcx                    /* sets ZF for the jnz at loop end */

/* Load one source cache line into eight scratch registers... */
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

prefetcht0 5*64(%rsi)       /* stay 5 lines ahead of the loads */

/* ...and store it to the destination line. */
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64 (%rsi), %rsi        /* lea advances pointers without touching flags */
leaq 64 (%rdi), %rdi

jnz .Loop64

movl $5,%ecx                /* last 5 cache lines, no prefetch */
.p2align 4
.Loop2:
decl %ecx

movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12

movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)

leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi

jnz .Loop2

movq (%rsp),%rbx
movq 1*8(%rsp),%r12
addq $2*8,%rsp
ret

/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>

/*
 * Alternatives-patching record: on X86_FEATURE_REP_GOOD CPUs the
 * unrolled copy above is patched with the rep-movsq variant below.
 *
 * NOTE(review): both length bytes below encode the *replacement*
 * length, whereas clear_page.S records the original routine's length
 * in the first byte.  This presumably works because the replacement
 * ends in ret, leaving the unpatched tail unreachable -- confirm
 * against struct alt_instr / apply_alternatives, and consider adding
 * a copy_page_end label for consistency with clear_page.S.
 */
.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_REP_GOOD
.byte copy_page_c_end-copy_page_c
.byte copy_page_c_end-copy_page_c
.previous

/* Replacement body: copy the page with a single rep movsq. */
.section .altinstr_replacement,"ax"
copy_page_c:
movl $4096/8,%ecx           /* 512 quadwords */
rep
movsq
ret
copy_page_c_end:
.previous
Loading

0 comments on commit b82ff47

Please sign in to comment.