-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
sparc64: Niagara-4 bzero/memset, plus use MRU stores in page copy.
This adds optimized memset/bzero/page-clear routines for Niagara-4. We basically can do what powerpc has been able to do for a decade (via the "dcbz" instruction), which is to use cache line clearing stores for bzero and memsets with a 'c' argument of zero. As long as we make the cache initializing store to each 32-byte subblock of the L2 cache line, it works. As with other Niagara-4 optimized routines, the key is to make sure to avoid any usage of the %asi register, as reads and writes to it cost at least 50 cycles. For the user clear cases, we don't use these new routines, we use the Niagara-1 variants instead. Those have to use %asi in an unavoidable way. A Niagara-4 8K page clear costs just under 600 cycles. Add definitions of the MRU variants of the cache initializing store ASIs. By default, cache initializing stores install the line as Least Recently Used. If we know we're going to use the data immediately (which is true for page copies and clears) we can use the Most Recently Used variant, to decrease the likelihood of the lines being evicted before they get used. Signed-off-by: David S. Miller <davem@davemloft.net>
- Loading branch information
David S. Miller
committed
Oct 5, 2012
1 parent
ffa9009
commit 9f82596
Showing
7 changed files
with
176 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
/* NG4clear_page.S: Niagara-4 optimized clear page. | ||
* | ||
* Copyright (C) 2012 (davem@davemloft.net) | ||
*/ | ||
|
||
#include <asm/asi.h> | ||
#include <asm/page.h> | ||
|
||
.text | ||
|
||
.register %g3, #scratch | ||
|
||
/* NG4clear_page / NG4clear_user_page: zero PAGE_SIZE bytes starting at %o0. | ||
 * | ||
 * Both symbols label the same code. | ||
 * In:    %o0 = dest; %o1 = vaddr for NG4clear_user_page (never read here) | ||
 * Clobb: %o0 (left pointing just past the cleared range), %g3, %g7, | ||
 *        integer condition codes | ||
 * | ||
 * Each pass of the loop issues one MRU cache-initializing store into each | ||
 * 32-byte half of a 64-byte chunk, then steps to the next chunk.  A store | ||
 * barrier is issued once, after the last chunk. | ||
 */ | ||
        .align          32 | ||
        .globl          NG4clear_user_page | ||
        .globl          NG4clear_page | ||
NG4clear_page:          /* %o0=dest */ | ||
NG4clear_user_page:     /* %o0=dest, %o1=vaddr */ | ||
        mov             0x20, %g3               /* offset of the second 32-byte half */ | ||
        set             PAGE_SIZE, %g7          /* %g7 = bytes still to clear */ | ||
.Lclear_chunk: | ||
        stxa            %g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P | ||
        subcc           %g7, 0x40, %g7          /* one 64-byte chunk accounted for */ | ||
        stxa            %g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P | ||
        bne,pt          %xcc, .Lclear_chunk | ||
         add            %o0, 0x40, %o0          /* delay slot: advance dest */ | ||
        membar          #StoreStore|#StoreLoad | ||
        retl | ||
         nop | ||
        .size           NG4clear_user_page,.-NG4clear_user_page | ||
        .size           NG4clear_page,.-NG4clear_page |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/* NG4memset.S: Niagara-4 optimized memset/bzero. | ||
* | ||
* Copyright (C) 2012 David S. Miller (davem@davemloft.net) | ||
*/ | ||
|
||
#include <asm/asi.h> | ||
|
||
.register %g2, #scratch | ||
.register %g3, #scratch | ||
|
||
.text | ||
/* NG4memset: fill entry point.  Prepares %o1 = length and %o4 = 64-bit | ||
 * store pattern, then joins the shared fill code at the numeric label 1 | ||
 * that follows NG4bzero's "clr %o4". | ||
 * | ||
 * In:    %o0 = dest (untouched here), %o1 = fill value (only the low | ||
 *        8 bits are used), %o2 = length | ||
 * Clobb: %o1, %o2, %o4, %g1, integer condition codes | ||
 */ | ||
        .align          32 | ||
        .globl          NG4memset | ||
NG4memset: | ||
        andcc           %o1, 0xff, %o4          /* %o4 = fill byte; Z set when it is zero */ | ||
        be,pt           %icc, 1f                /* zero fill: %o4 is already the pattern */ | ||
         mov            %o2, %o1                /* delay slot: %o1 = length on both paths */ | ||
        sllx            %o4, 8, %g1 | ||
        or              %o4, %g1, %o2           /* fill byte replicated to 16 bits */ | ||
        sllx            %o2, 16, %g1 | ||
        or              %o2, %g1, %o2           /* ... to 32 bits */ | ||
        sllx            %o2, 32, %g1 | ||
        ba,pt           %icc, 1f | ||
         or             %o2, %g1, %o4           /* delay slot: %o4 = byte replicated to 64 bits */ | ||
        .size           NG4memset,.-NG4memset | ||
|
||
/* NG4bzero: store the 64-bit pattern in %o4 over %o1 bytes starting at %o0. | ||
 * | ||
 * Entered at NG4bzero with %o0 = dest and %o1 = length (pattern forced to | ||
 * zero), or at the numeric label 1 from NG4memset with %o4 already holding | ||
 * the fill byte replicated across all 8 bytes. | ||
 * | ||
 * Out:   %o0 = original dest | ||
 * Clobb: %o1, %o3, %o4, %o5, %g1, %g2, %g3, integer condition codes | ||
 * | ||
 * The length is a full 64-bit unsigned value: every test of a length-derived | ||
 * count uses %xcc with an unsigned condition, because %icc reflects only the | ||
 * low 32 bits of the result and would mis-handle lengths of 4GB or more. | ||
 */ | ||
        .align          32 | ||
        .globl          NG4bzero | ||
NG4bzero: | ||
        clr             %o4 | ||
1:      cmp             %o1, 16                 /* NG4memset joins here via "1f" */ | ||
        bleu            %xcc, .Ltiny            /* <= 16 bytes: plain byte loop */ | ||
         mov            %o0, %o3                /* delay slot: keep dest for the return value */ | ||
        sub             %g0, %o0, %g1 | ||
        and             %g1, 0x7, %g1           /* %g1 = bytes up to 8-byte alignment */ | ||
        brz,pt          %g1, .Laligned8 | ||
         sub            %o1, %g1, %o1 | ||
.Lalign8_loop: | ||
        stb             %o4, [%o0 + 0x00] | ||
        subcc           %g1, 1, %g1 | ||
        bne,pt          %xcc, .Lalign8_loop | ||
         add            %o0, 1, %o0 | ||
.Laligned8: | ||
        cmp             %o1, 64 + (64 - 8) | ||
        bleu            %xcc, .Lmedium          /* too short to be worth 64-byte chunks */ | ||
         sub            %g0, %o0, %g1 | ||
        andcc           %g1, (64 - 1), %g1      /* %g1 = bytes up to 64-byte alignment */ | ||
        brz,pn          %g1, .Laligned64 | ||
         sub            %o1, %g1, %o1 | ||
.Lalign64_loop: | ||
        stx             %o4, [%o0 + 0x00] | ||
        subcc           %g1, 8, %g1 | ||
        bne,pt          %xcc, .Lalign64_loop | ||
         add            %o0, 0x8, %o0 | ||
.Laligned64: | ||
        andn            %o1, 64 - 1, %g1        /* %g1 = bytes covered by whole 64-byte chunks */ | ||
        sub             %o1, %g1, %o1           /* %o1 = tail bytes (< 64) */ | ||
        brnz,pn         %o4, .Lnon_bzero_loop | ||
         mov            0x20, %g2               /* delay slot: used by both chunk loops */ | ||
        /* Zero pattern: one initializing store per 32-byte half of the chunk. */ | ||
.Lbzero_chunk: | ||
        stxa            %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
        subcc           %g1, 0x40, %g1 | ||
        stxa            %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
        bne,pt          %xcc, .Lbzero_chunk     /* byte count may exceed 32 bits */ | ||
         add            %o0, 0x40, %o0 | ||
.Lpostloop: | ||
        cmp             %o1, 8 | ||
        blu,pn          %xcc, .Ltiny | ||
         membar         #StoreStore|#StoreLoad  /* delay slot: executed on both paths */ | ||
.Lmedium: | ||
        andn            %o1, 0x7, %g1           /* %g1 = bytes in whole 8-byte words (>= 8 here) */ | ||
        sub             %o1, %g1, %o1           /* %o1 = tail bytes (< 8) */ | ||
.Lmedium_loop: | ||
        stx             %o4, [%o0 + 0x00] | ||
        subcc           %g1, 0x8, %g1 | ||
        bne,pt          %xcc, .Lmedium_loop | ||
         add            %o0, 0x08, %o0 | ||
        andcc           %o1, 0x4, %g1           /* a 4-byte word left in the tail? */ | ||
        be,pt           %xcc, .Ltiny | ||
         sub            %o1, %g1, %o1 | ||
        stw             %o4, [%o0 + 0x00] | ||
        add             %o0, 0x4, %o0 | ||
.Ltiny: | ||
        cmp             %o1, 0 | ||
        be,pn           %xcc, .Lexit | ||
.Ltiny_loop: | ||
         subcc          %o1, 1, %o1             /* delay slot of the "be" and top of the byte loop */ | ||
        stb             %o4, [%o0 + 0x00] | ||
        bne,pt          %xcc, .Ltiny_loop | ||
         add            %o0, 1, %o0 | ||
.Lexit: | ||
        retl | ||
         mov            %o3, %o0                /* delay slot: return the original dest */ | ||
        /* Non-zero pattern: all eight 8-byte words of the chunk are written, | ||
         * at offsets 0x00, 0x20, 0x08, 0x28 and then, after stepping the | ||
         * base by 0x10, at 0x10, 0x30, 0x18, 0x38. | ||
         */ | ||
.Lnon_bzero_loop: | ||
        mov             0x08, %g3 | ||
        mov             0x28, %o5 | ||
.Lfill_chunk: | ||
        stxa            %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
        subcc           %g1, 0x40, %g1 | ||
        stxa            %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
        stxa            %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
        stxa            %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P | ||
        add             %o0, 0x10, %o0 | ||
        stxa            %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
        stxa            %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
        stxa            %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
        stxa            %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P | ||
        bne,pt          %xcc, .Lfill_chunk      /* byte count may exceed 32 bits */ | ||
         add            %o0, 0x30, %o0          /* delay slot: 0x10 + 0x30 = one 64-byte chunk */ | ||
        ba,a,pt         %xcc, .Lpostloop        /* annulled: no delay-slot instruction runs */ | ||
        .size           NG4bzero,.-NG4bzero |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters