OMAP3: PM: Update clean_l2 to use v7_flush_dcache_all
Analysis of the TI kernel with ETM (Embedded Trace Macrocell) showed that
using the kernel's cache-mapped flush instead of the SO (strongly-ordered)
mapped flush cuts the cost of clean_l2, which is used during the sleep
sequences, by 65% (3.39 ms down to 1.17 ms).
Overall:
	- speed-up
	- unfortunately there isn't a good alternative flush method today
	- code reduction, less maintenance, and fewer potential bugs in
	  unmaintained code

This also fixes the bug in how the clean_l2 function was used.

Reported-by: Tony Lindgren <tony@atomide.com>

Cc: Kevin Hilman <khilman@deeprootsystems.com>
Cc: Tony Lindgren <tony@atomide.com>

Acked-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Acked-by: Jean Pihet <j-pihet@ti.com>

[nm@ti.com: ported rkw's proposal to 2.6.37-rc2]
Signed-off-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Richard Woodruff <r-woodruff2@ti.com>
Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
Richard Woodruff authored and Kevin Hilman committed Dec 21, 2010
1 parent 1cbbe37 commit 0bd4053
Showing 1 changed file with 14 additions and 66 deletions.
80 changes: 14 additions & 66 deletions arch/arm/mach-omap2/sleep34xx.S
@@ -520,72 +520,18 @@ clean_caches:
cmp r9, #1 /* Check whether L2 inval is required or not*/
bne skip_l2_inval
clean_l2:
/* read clidr */
mrc p15, 1, r0, c0, c0, 1
/* extract loc from clidr */
ands r3, r0, #0x7000000
/* left align loc bit field */
mov r3, r3, lsr #23
/* if loc is 0, then no need to clean */
beq finished
/* start clean at cache level 0 */
mov r10, #0
loop1:
/* work out 3x current cache level */
add r2, r10, r10, lsr #1
/* extract cache type bits from clidr*/
mov r1, r0, lsr r2
/* mask off the bits for the current cache only */
and r1, r1, #7
/* see what cache we have at this level */
cmp r1, #2
/* skip if no cache, or just i-cache */
blt skip
/* select current cache level in cssr */
mcr p15, 2, r10, c0, c0, 0
/* isb to sync the new cssr & csidr */
isb
/* read the new csidr */
mrc p15, 1, r1, c0, c0, 0
/* extract the length of the cache lines */
and r2, r1, #7
/* add 4 (line length offset) */
add r2, r2, #4
ldr r4, assoc_mask
/* find the maximum way number (associativity - 1) */
ands r4, r4, r1, lsr #3
/* find bit position of way size increment */
clz r5, r4
ldr r7, numset_mask
/* extract the maximum set (index) number */
ands r7, r7, r1, lsr #13
loop2:
mov r9, r4
/* create working copy of max way size*/
loop3:
/* factor way and cache number into r11 */
orr r11, r10, r9, lsl r5
/* factor index number into r11 */
orr r11, r11, r7, lsl r2
/* clean D-cache line by set/way */
mcr p15, 0, r11, c7, c10, 2
/* decrement the way*/
subs r9, r9, #1
bge loop3
/* decrement the set index */
subs r7, r7, #1
bge loop2
skip:
add r10, r10, #2
/* increment cache number */
cmp r3, r10
bgt loop1
finished:
/* switch back to cache level 0 */
mov r10, #0
/* select current cache level in cssr */
mcr p15, 2, r10, c0, c0, 0
isb
/*
* Jump out to kernel flush routine
* - reusing that code is better
* - it executes in a cached space so is faster than refetch per-block
* - should be faster and will evolve with the kernel
* - 'might' have to copy address, load and jump to it
* - lr is used since we are running in SRAM currently.
*/
ldr r1, kernel_flush
mov lr, pc
bx r1

skip_l2_inval:
/* Data memory barrier and Data sync barrier */
mov r1, #0
@@ -668,5 +614,7 @@ cache_pred_disable_mask:
.word 0xFFFFE7FB
control_stat:
.word CONTROL_STAT
kernel_flush:
.word v7_flush_dcache_all
ENTRY(omap34xx_cpu_suspend_sz)
.word . - omap34xx_cpu_suspend
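
To make the replaced logic easier to follow, here is a small, host-runnable C model of the clean-by-set/way walk that the deleted assembly performed and that the kernel's v7_flush_dcache_all performs on its behalf. It is an illustrative sketch only, not code from the commit: the CLIDR/CCSIDR values are invented placeholders for the CP15 reads, and the composed operand is what MCR p15, 0, Rt, c7, c10, 2 (DCCSW) would receive on real hardware.

#include <stdio.h>
#include <stdint.h>

/* Count leading zeros of a 32-bit value, as the ARM CLZ instruction does. */
static int clz32(uint32_t x)
{
	int n = 0;

	if (x == 0)
		return 32;
	while (!(x & 0x80000000u)) {
		x <<= 1;
		n++;
	}
	return n;
}

int main(void)
{
	/*
	 * Hypothetical example values, not real OMAP3 register contents:
	 * CLIDR:  Ctype1 = 3 (separate I+D), Ctype2 = 4 (unified), LoC = 2.
	 * CCSIDR: LineSize = 1 (32-byte lines), Associativity - 1 = 3,
	 *         NumSets - 1 = 127.  (Reused for both levels for brevity;
	 *         real code re-reads CCSIDR after selecting each level.)
	 */
	uint32_t clidr  = (2u << 24) | (4u << 3) | 3u;
	uint32_t ccsidr = (127u << 13) | (3u << 3) | 1u;
	uint32_t loc = (clidr >> 24) & 0x7;		/* Level of Coherency */
	unsigned long ops = 0;

	for (uint32_t level = 0; level < loc; level++) {
		uint32_t ctype = (clidr >> (level * 3)) & 0x7;

		if (ctype < 2)		/* no cache, or I-cache only: nothing to clean */
			continue;

		/* On hardware: write CSSELR = level << 1, isb, then read CCSIDR. */
		uint32_t line_shift = (ccsidr & 0x7) + 4;	/* log2(line size in bytes) */
		uint32_t max_way    = (ccsidr >> 3) & 0x3ff;	/* associativity - 1 */
		uint32_t max_set    = (ccsidr >> 13) & 0x7fff;	/* NumSets - 1 */
		uint32_t way_shift  = max_way ? clz32(max_way) : 0;

		/* Outer loop over sets, inner over ways, as in the removed code. */
		for (uint32_t set = 0; set <= max_set; set++) {
			for (uint32_t way = 0; way <= max_way; way++) {
				uint32_t sw = (level << 1) | (way << way_shift)
					    | (set << line_shift);
				ops++;
				if (set == 0 && way < 2)	/* print a few samples */
					printf("level %u: DCCSW operand 0x%08x\n",
					       (unsigned)level, (unsigned)sw);
			}
		}
	}
	printf("total clean-by-set/way operations: %lu\n", ops);
	return 0;
}

Even for this small made-up configuration the walk issues on the order of a thousand MCRs (sets × ways per level), which is why it matters whether the loop itself executes from cached kernel text or from the SO-mapped SRAM copy, the difference the commit message quantifies as 65%.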
