Skip to content

Commit

Permalink
powerpc/64s: Improve RFI L1-D cache flush fallback
Browse files Browse the repository at this point in the history
The fallback RFI flush is used when firmware does not provide a way
to flush the cache. It's a "displacement flush" that evicts useful
data by displacing it with an uninteresting buffer.

The flush has to take care to work with implementation specific cache
replacment policies, so the recipe has been in flux. The initial
slow but conservative approach is to touch all lines of a congruence
class, with dependencies between each load. It has since been
determined that a linear pattern of loads without dependencies is
sufficient, and is significantly faster.

Measuring the speed of a null syscall with RFI fallback flush enabled
gives the relative improvement:

P8 - 1.83x
P9 - 1.75x

The flush also becomes simpler and more adaptable to different cache
geometries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
  • Loading branch information
Nicholas Piggin authored and Michael Ellerman committed Jan 23, 2018
1 parent 35adacd commit bdcb1ae
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 58 deletions.
3 changes: 1 addition & 2 deletions arch/powerpc/include/asm/paca.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,7 @@ struct paca_struct {
*/
u64 exrfi[EX_SIZE] __aligned(0x80);
void *rfi_flush_fallback_area;
u64 l1d_flush_congruence;
u64 l1d_flush_sets;
u64 l1d_flush_size;
#endif
};

Expand Down
3 changes: 1 addition & 2 deletions arch/powerpc/kernel/asm-offsets.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,7 @@ int main(void)
OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
OFFSET(PACA_EXRFI, paca_struct, exrfi);
OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);

#endif
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
Expand Down
76 changes: 36 additions & 40 deletions arch/powerpc/kernel/exceptions-64s.S
Original file line number Diff line number Diff line change
Expand Up @@ -1461,39 +1461,37 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback)
std r9,PACA_EXRFI+EX_R9(r13)
std r10,PACA_EXRFI+EX_R10(r13)
std r11,PACA_EXRFI+EX_R11(r13)
std r12,PACA_EXRFI+EX_R12(r13)
std r8,PACA_EXRFI+EX_R13(r13)
mfctr r9
ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
ld r11,PACA_L1D_FLUSH_SETS(r13)
ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
/*
* The load adresses are at staggered offsets within cachelines,
* which suits some pipelines better (on others it should not
* hurt).
*/
addi r12,r12,8
ld r11,PACA_L1D_FLUSH_SIZE(r13)
srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
mtctr r11
DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */

/* order ld/st prior to dcbt stop all streams with flushing */
sync
1: li r8,0
.rept 8 /* 8-way set associative */
ldx r11,r10,r8
add r8,r8,r12
xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
add r8,r8,r11 // Add 0, this creates a dependency on the ldx
.endr
addi r10,r10,128 /* 128 byte cache line */

/*
* The load adresses are at staggered offsets within cachelines,
* which suits some pipelines better (on others it should not
* hurt).
*/
1:
ld r11,(0x80 + 8)*0(r10)
ld r11,(0x80 + 8)*1(r10)
ld r11,(0x80 + 8)*2(r10)
ld r11,(0x80 + 8)*3(r10)
ld r11,(0x80 + 8)*4(r10)
ld r11,(0x80 + 8)*5(r10)
ld r11,(0x80 + 8)*6(r10)
ld r11,(0x80 + 8)*7(r10)
addi r10,r10,0x80*8
bdnz 1b

mtctr r9
ld r9,PACA_EXRFI+EX_R9(r13)
ld r10,PACA_EXRFI+EX_R10(r13)
ld r11,PACA_EXRFI+EX_R11(r13)
ld r12,PACA_EXRFI+EX_R12(r13)
ld r8,PACA_EXRFI+EX_R13(r13)
GET_SCRATCH0(r13);
rfid

Expand All @@ -1503,39 +1501,37 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback)
std r9,PACA_EXRFI+EX_R9(r13)
std r10,PACA_EXRFI+EX_R10(r13)
std r11,PACA_EXRFI+EX_R11(r13)
std r12,PACA_EXRFI+EX_R12(r13)
std r8,PACA_EXRFI+EX_R13(r13)
mfctr r9
ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
ld r11,PACA_L1D_FLUSH_SETS(r13)
ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
/*
* The load adresses are at staggered offsets within cachelines,
* which suits some pipelines better (on others it should not
* hurt).
*/
addi r12,r12,8
ld r11,PACA_L1D_FLUSH_SIZE(r13)
srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
mtctr r11
DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */

/* order ld/st prior to dcbt stop all streams with flushing */
sync
1: li r8,0
.rept 8 /* 8-way set associative */
ldx r11,r10,r8
add r8,r8,r12
xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
add r8,r8,r11 // Add 0, this creates a dependency on the ldx
.endr
addi r10,r10,128 /* 128 byte cache line */

/*
* The load adresses are at staggered offsets within cachelines,
* which suits some pipelines better (on others it should not
* hurt).
*/
1:
ld r11,(0x80 + 8)*0(r10)
ld r11,(0x80 + 8)*1(r10)
ld r11,(0x80 + 8)*2(r10)
ld r11,(0x80 + 8)*3(r10)
ld r11,(0x80 + 8)*4(r10)
ld r11,(0x80 + 8)*5(r10)
ld r11,(0x80 + 8)*6(r10)
ld r11,(0x80 + 8)*7(r10)
addi r10,r10,0x80*8
bdnz 1b

mtctr r9
ld r9,PACA_EXRFI+EX_R9(r13)
ld r10,PACA_EXRFI+EX_R10(r13)
ld r11,PACA_EXRFI+EX_R11(r13)
ld r12,PACA_EXRFI+EX_R12(r13)
ld r8,PACA_EXRFI+EX_R13(r13)
GET_SCRATCH0(r13);
hrfid

Expand Down
13 changes: 1 addition & 12 deletions arch/powerpc/kernel/setup_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -875,19 +875,8 @@ static void init_fallback_flush(void)
memset(l1d_flush_fallback_area, 0, l1d_size * 2);

for_each_possible_cpu(cpu) {
/*
* The fallback flush is currently coded for 8-way
* associativity. Different associativity is possible, but it
* will be treated as 8-way and may not evict the lines as
* effectively.
*
* 128 byte lines are mandatory.
*/
u64 c = l1d_size / 8;

paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
paca[cpu].l1d_flush_congruence = c;
paca[cpu].l1d_flush_sets = c / 128;
paca[cpu].l1d_flush_size = l1d_size;
}
}

Expand Down
2 changes: 0 additions & 2 deletions arch/powerpc/xmon/xmon.c
Original file line number Diff line number Diff line change
Expand Up @@ -2377,8 +2377,6 @@ static void dump_one_paca(int cpu)
printf(" slb_cache[%d]: = 0x%016lx\n", i, p->slb_cache[i]);

DUMP(p, rfi_flush_fallback_area, "px");
DUMP(p, l1d_flush_congruence, "llx");
DUMP(p, l1d_flush_sets, "llx");
#endif
DUMP(p, dscr_default, "llx");
#ifdef CONFIG_PPC_BOOK3E
Expand Down

0 comments on commit bdcb1ae

Please sign in to comment.