powerpc: Optimise the 64bit optimised __clear_user
I blame Mikey for this. He elevated my slightly dubious testcase:

to benchmark status. And naturally we need to be number 1 at creating
zeros. So let's improve __clear_user some more.

As Paul suggests, we can use dcbz for large lengths. This patch gets
the destination cacheline aligned then uses dcbz on whole cachelines.
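
In rough C terms the long-clear path looks like the sketch below. This is
an illustration only, not code from the patch: the err1/err2 user-access
fault fixups are omitted, and cacheline_size stands in for the line size
read from ppc64_caches via the TOC.

#include <stdint.h>
#include <stddef.h>

/* dcbz zeroes the whole data cacheline containing the address */
static inline void zero_cacheline(void *p)
{
        asm volatile("dcbz 0,%0" : : "r" (p) : "memory");
}

/*
 * dst is assumed to already be 16 byte aligned and len to be large,
 * which is the state the assembly has established by label 11.
 */
void long_clear(char *dst, size_t len, size_t cacheline_size)
{
        /*
         * Bytes to the next cacheline boundary; the neg/and pair in
         * the assembly computes the same value.
         */
        size_t align = (size_t)(-(uintptr_t)dst) & (cacheline_size - 1);

        /* Get the destination cacheline aligned with 16 byte stores */
        len -= align;
        for (; align; align -= 16, dst += 16) {
                *(uint64_t *)dst = 0;
                *(uint64_t *)(dst + 8) = 0;
        }

        /* Zero whole cachelines with dcbz */
        for (; len >= cacheline_size; len -= cacheline_size, dst += cacheline_size)
                zero_cacheline(dst);

        /* Sub-cacheline tail goes back to the 32 byte and short loops */
        while (len--)
                *dst++ = 0;
}

The assembly additionally falls back to .Lmedium_clear when the length is
under four cachelines, so the alignment and dcbz overhead is only paid
when it is worthwhile.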

Before:
10485760000 bytes (10 GB) copied, 0.414744 s, 25.3 GB/s

After:
10485760000 bytes (10 GB) copied, 0.268597 s, 39.0 GB/s

39 GB/s, a new record.

Signed-off-by: Anton Blanchard <anton@samba.org>
Tested-by: Olof Johansson <olof@lixom.net>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Anton Blanchard authored and Benjamin Herrenschmidt committed Jul 3, 2012
1 parent b4c3a87 commit cf8fb55
Showing 1 changed file with 62 additions and 1 deletion.

arch/powerpc/lib/string_64.S
@@ -19,6 +19,12 @@
*/

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

.section ".toc","aw"
PPC64_CACHES:
.tc ppc64_caches[TC],ppc64_caches
.section ".text"

/**
* __clear_user: - Zero a block of memory in user space, with less checking.
@@ -94,9 +100,14 @@ err1; stw r0,0(r3)
addi r3,r3,4

3: sub r4,r4,r6
srdi r6,r4,5

cmpdi r4,32
cmpdi cr1,r4,512
blt .Lshort_clear
bgt cr1,.Llong_clear

.Lmedium_clear:
srdi r6,r4,5
mtctr r6

/* Do 32 byte chunks */
@@ -139,3 +150,53 @@ err1; stb r0,0(r3)

10: li r3,0
blr

.Llong_clear:
ld r5,PPC64_CACHES@toc(r2)

bf cr7*4+0,11f
err2; std r0,0(r3)
addi r3,r3,8
addi r4,r4,-8

/* Destination is 16 byte aligned, need to get it cacheline aligned */
11: lwz r7,DCACHEL1LOGLINESIZE(r5)
lwz r9,DCACHEL1LINESIZE(r5)

/*
* With worst case alignment the long clear loop takes a minimum
* of 1 byte less than 2 cachelines.
*/
sldi r10,r9,2
cmpd r4,r10
blt .Lmedium_clear

neg r6,r3
addi r10,r9,-1
and. r5,r6,r10
beq 13f

srdi r6,r5,4
mtctr r6
mr r8,r3
12:
err1; std r0,0(r3)
err1; std r0,8(r3)
addi r3,r3,16
bdnz 12b

sub r4,r4,r5

13: srd r6,r4,r7
mtctr r6
mr r8,r3
14:
err1; dcbz r0,r3
add r3,r3,r9
bdnz 14b

and r4,r4,r10

cmpdi r4,32
blt .Lshort_clear
b .Lmedium_clear
