Skip to content

Commit

Permalink
powernv/cpuidle: Redesign idle states management
Browse files Browse the repository at this point in the history
Deep idle states like sleep and winkle are per core idle states. A core
enters these states only when all the threads enter either the
particular idle state or a deeper one. There are tasks like fastsleep
hardware bug workaround and hypervisor core state save which have to be
done only by the last thread of the core entering deep idle state and
similarly tasks like timebase resync, hypervisor core register restore
that have to be done only by the first thread waking up from these
states.

The current idle state management does not have a way to distinguish the
first/last thread of the core waking/entering idle states. Tasks like
timebase resync are done for all the threads. This is not only
suboptimal, but can cause functionality issues when subcores and KVM are
involved.

This patch adds the necessary infrastructure to track idle states of
threads in a per-core structure. It uses this info to perform tasks like
fastsleep workaround and timebase resync only once per core.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Originally-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: linux-pm@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
  • Loading branch information
Shreyas B. Prabhu authored and Michael Ellerman committed Dec 14, 2014
1 parent 8eb8ac8 commit 7cba160
Show file tree
Hide file tree
Showing 11 changed files with 296 additions and 58 deletions.
20 changes: 20 additions & 0 deletions arch/powerpc/include/asm/cpuidle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef _ASM_POWERPC_CPUIDLE_H
#define _ASM_POWERPC_CPUIDLE_H

#ifdef CONFIG_PPC_POWERNV
/*
 * Used in powernv idle state management.
 *
 * This header is shared between C and assembly; everything outside the
 * __ASSEMBLY__ guard below must stay preprocessor-only.
 */

/*
 * Per-thread idle state, stored in the paca's thread_idle_state byte on
 * idle entry and reset to PNV_THREAD_RUNNING on wakeup.
 *
 * The numeric order is significant: the idle entry/wakeup assembly
 * compares against these values with ordered branches (e.g. "state >=
 * PNV_THREAD_SLEEP" selects the sleep/winkle path), so deeper states
 * must have larger values.
 */
#define PNV_THREAD_RUNNING 0
#define PNV_THREAD_NAP 1
#define PNV_THREAD_SLEEP 2
#define PNV_THREAD_WINKLE 3

/*
 * Layout of the per-core idle state word ([L][TTTTTTTT]) that each
 * thread's paca points to via core_idle_state_ptr:
 *
 * PNV_CORE_IDLE_THREAD_BITS - low 8 bits, one bit per thread selected by
 *   the paca's thread_mask. A thread clears its bit when entering
 *   sleep/winkle and sets it again on wakeup, so an all-zero field means
 *   every thread of the core is in a deep idle state.
 * PNV_CORE_IDLE_LOCK_BIT - set while one thread performs per-core work
 *   (fastsleep workaround on entry, or workaround undo / timebase resync
 *   on wakeup). Waking threads spin until it is cleared.
 */
#define PNV_CORE_IDLE_LOCK_BIT 0x100
#define PNV_CORE_IDLE_THREAD_BITS 0x0FF

#ifndef __ASSEMBLY__
/*
 * Labels defined in the powernv idle assembly, each placed on the branch
 * instruction that calls the fastsleep workaround code on idle entry and
 * on wakeup respectively. According to the comments at those labels, the
 * branch is patched out when idle states are discovered if the platform
 * does not need the workaround.
 * NOTE(review): presumably declared as u32 arrays so that C code can take
 * the address of the instruction word to patch - confirm against the
 * platform setup code.
 */
extern u32 pnv_fastsleep_workaround_at_entry[];
extern u32 pnv_fastsleep_workaround_at_exit[];
#endif /* __ASSEMBLY__ */

#endif /* CONFIG_PPC_POWERNV */

#endif /* _ASM_POWERPC_CPUIDLE_H */
2 changes: 2 additions & 0 deletions arch/powerpc/include/asm/opal.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ struct opal_sg_list {
#define OPAL_PCI_ERR_INJECT 96
#define OPAL_PCI_EEH_FREEZE_SET 97
#define OPAL_HANDLE_HMI 98
#define OPAL_CONFIG_CPU_IDLE_STATE 99
#define OPAL_REGISTER_DUMP_REGION 101
#define OPAL_UNREGISTER_DUMP_REGION 102
#define OPAL_WRITE_TPO 103
Expand All @@ -175,6 +176,7 @@ struct opal_sg_list {
*/
#define OPAL_PM_NAP_ENABLED 0x00010000
#define OPAL_PM_SLEEP_ENABLED 0x00020000
#define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000

#ifndef __ASSEMBLY__

Expand Down
8 changes: 8 additions & 0 deletions arch/powerpc/include/asm/paca.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,14 @@ struct paca_struct {
u64 tm_scratch; /* TM scratch area for reclaim */
#endif

#ifdef CONFIG_PPC_POWERNV
/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
u32 *core_idle_state_ptr;
u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */
/* Mask to indicate thread id in core */
u8 thread_mask;
#endif

#ifdef CONFIG_PPC_BOOK3S_64
/* Exclusive emergency stack pointer for machine check exception. */
void *mc_emergency_sp;
Expand Down
2 changes: 1 addition & 1 deletion arch/powerpc/include/asm/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};

extern int powersave_nap; /* set if nap mode can be used in idle loop */
extern unsigned long power7_nap(int check_irq);
extern void power7_sleep(void);
extern unsigned long power7_sleep(void);
extern void flush_instruction_cache(void);
extern void hard_reset_now(void);
extern void poweroff_now(void);
Expand Down
9 changes: 9 additions & 0 deletions arch/powerpc/kernel/asm-offsets.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,5 +726,14 @@ int main(void)
arch.timing_last_enter.tv32.tbl));
#endif

#ifdef CONFIG_PPC_POWERNV
DEFINE(PACA_CORE_IDLE_STATE_PTR,
offsetof(struct paca_struct, core_idle_state_ptr));
DEFINE(PACA_THREAD_IDLE_STATE,
offsetof(struct paca_struct, thread_idle_state));
DEFINE(PACA_THREAD_MASK,
offsetof(struct paca_struct, thread_mask));
#endif

return 0;
}
24 changes: 15 additions & 9 deletions arch/powerpc/kernel/exceptions-64s.S
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/exception-64s.h>
#include <asm/ptrace.h>
#include <asm/cpuidle.h>

/*
* We layout physical memory as follows:
Expand Down Expand Up @@ -109,15 +110,19 @@ BEGIN_FTR_SECTION
rlwinm. r13,r13,47-31,30,31
beq 9f

/* waking up from powersave (nap) state */
cmpwi cr1,r13,2
/* Total loss of HV state is fatal, we could try to use the
* PIR to locate a PACA, then use an emergency stack etc...
* OPAL v3 based powernv platforms have new idle states
* which fall in this category.
*/
bgt cr1,8f
cmpwi cr3,r13,2

GET_PACA(r13)
lbz r0,PACA_THREAD_IDLE_STATE(r13)
cmpwi cr2,r0,PNV_THREAD_NAP
bgt cr2,8f /* Either sleep or Winkle */

/* Waking up from nap should not cause hypervisor state loss */
bgt cr3,.

/* Waking up from nap */
li r0,PNV_THREAD_RUNNING
stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */

#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
li r0,KVM_HWTHREAD_IN_KERNEL
Expand All @@ -133,7 +138,7 @@ BEGIN_FTR_SECTION

/* Return SRR1 from power7_nap() */
mfspr r3,SPRN_SRR1
beq cr1,2f
beq cr3,2f
b power7_wakeup_noloss
2: b power7_wakeup_loss

Expand Down Expand Up @@ -1382,6 +1387,7 @@ machine_check_handle_early:
MACHINE_CHECK_HANDLER_WINDUP
GET_PACA(r13)
ld r1,PACAR1(r13)
li r3,PNV_THREAD_NAP
b power7_enter_nap_mode
4:
#endif
Expand Down
197 changes: 153 additions & 44 deletions arch/powerpc/kernel/idle_power7.S
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <asm/hw_irq.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/opal.h>
#include <asm/cpuidle.h>

#undef DEBUG

Expand All @@ -37,8 +38,7 @@

/*
* Pass requested state in r3:
* 0 - nap
* 1 - sleep
* r3 - PNV_THREAD_NAP/SLEEP/WINKLE
*
* To check IRQ_HAPPENED in r4
* 0 - don't check
Expand Down Expand Up @@ -123,12 +123,58 @@ power7_enter_nap_mode:
li r4,KVM_HWTHREAD_IN_NAP
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
cmpwi cr0,r3,1
beq 2f
stb r3,PACA_THREAD_IDLE_STATE(r13)
cmpwi cr1,r3,PNV_THREAD_SLEEP
bge cr1,2f
IDLE_STATE_ENTER_SEQ(PPC_NAP)
/* No return */
2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
/* No return */
2:
/* Sleep or winkle */
lbz r7,PACA_THREAD_MASK(r13)
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
lwarx_loop1:
lwarx r15,0,r14
andc r15,r15,r7 /* Clear thread bit */

andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS

/*
* If cr0 = 0, then current thread is the last thread of the core entering
* sleep. Last thread needs to execute the hardware bug workaround code if
* required by the platform.
* Make the workaround call unconditionally here. The below branch call is
* patched out when the idle states are discovered if the platform does not
* require it.
*/
.global pnv_fastsleep_workaround_at_entry
pnv_fastsleep_workaround_at_entry:
beq fastsleep_workaround_at_entry

stwcx. r15,0,r14
bne- lwarx_loop1
isync

common_enter: /* common code for all the threads entering sleep */
IDLE_STATE_ENTER_SEQ(PPC_SLEEP)

fastsleep_workaround_at_entry:
ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
stwcx. r15,0,r14
bne- lwarx_loop1
isync

/* Fast sleep workaround */
li r3,1
li r4,1
li r0,OPAL_CONFIG_CPU_IDLE_STATE
bl opal_call_realmode

/* Clear Lock bit */
li r0,0
lwsync
stw r0,0(r14)
b common_enter


_GLOBAL(power7_idle)
/* Now check if user or arch enabled NAP mode */
Expand All @@ -141,49 +187,16 @@ _GLOBAL(power7_idle)

_GLOBAL(power7_nap)
mr r4,r3
li r3,0
li r3,PNV_THREAD_NAP
b power7_powersave_common
/* No return */

_GLOBAL(power7_sleep)
li r3,1
li r3,PNV_THREAD_SLEEP
li r4,1
b power7_powersave_common
/* No return */

/*
* Make opal call in realmode. This is a generic function to be called
* from realmode from reset vector. It handles endianness.
*
* r13 - paca pointer
* r1 - stack pointer
* r3 - opal token
*/
opal_call_realmode:
mflr r12
std r12,_LINK(r1)
ld r2,PACATOC(r13)
/* Set opal return address */
LOAD_REG_ADDR(r0,return_from_opal_call)
mtlr r0
/* Handle endian-ness */
li r0,MSR_LE
mfmsr r12
andc r12,r12,r0
mtspr SPRN_HSRR1,r12
mr r0,r3 /* Move opal token to r0 */
LOAD_REG_ADDR(r11,opal)
ld r12,8(r11)
ld r2,0(r11)
mtspr SPRN_HSRR0,r12
hrfid

return_from_opal_call:
FIXUP_ENDIAN
ld r0,_LINK(r1)
mtlr r0
blr

#define CHECK_HMI_INTERRUPT \
mfspr r0,SPRN_SRR1; \
BEGIN_FTR_SECTION_NESTED(66); \
Expand All @@ -197,7 +210,7 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
std r3,ORIG_GPR3(r1); /* Save original r3 */ \
li r3,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \
li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \
bl opal_call_realmode; \
ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \
20: nop;
Expand All @@ -206,16 +219,105 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
_GLOBAL(power7_wakeup_tb_loss)
ld r2,PACATOC(r13);
ld r1,PACAR1(r13)
/*
* Before entering any idle state, the NVGPRs are saved in the stack
* and they are restored before switching to the process context. Hence
* until they are restored, they are free to be used.
*
* Save SRR1 in a NVGPR as it might be clobbered in opal_call_realmode
* (called in CHECK_HMI_INTERRUPT). SRR1 is required to determine the
* wakeup reason if we branch to kvm_start_guest.
*/

mfspr r16,SPRN_SRR1
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)

lbz r7,PACA_THREAD_MASK(r13)
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
lwarx_loop2:
lwarx r15,0,r14
andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
/*
* Lock bit is set in one of the 2 cases-
* a. In the sleep/winkle enter path, the last thread is executing
* fastsleep workaround code.
* b. In the wake up path, another thread is executing fastsleep
* workaround undo code or resyncing timebase or restoring context
* In either case loop until the lock bit is cleared.
*/
bne core_idle_lock_held

cmpwi cr2,r15,0
or r15,r15,r7 /* Set thread bit */

beq cr2,first_thread

/* Not first thread in core to wake up */
stwcx. r15,0,r14
bne- lwarx_loop2
isync
b common_exit

core_idle_lock_held:
HMT_LOW
core_idle_lock_loop:
lwz r15,0(14)
andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
bne core_idle_lock_loop
HMT_MEDIUM
b lwarx_loop2

first_thread:
/* First thread in core to wakeup */
ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
stwcx. r15,0,r14
bne- lwarx_loop2
isync

/*
* First thread in the core waking up from fastsleep. It needs to
* call the fastsleep workaround code if the platform requires it.
* Call it unconditionally here. The below branch instruction will
* be patched out when the idle states are discovered if platform
* does not require workaround.
*/
.global pnv_fastsleep_workaround_at_exit
pnv_fastsleep_workaround_at_exit:
b fastsleep_workaround_at_exit

timebase_resync:
/* Do timebase resync if we are waking up from sleep. Use cr3 value
* set in exceptions-64s.S */
ble cr3,clear_lock
/* Time base re-sync */
li r3,OPAL_RESYNC_TIMEBASE
li r0,OPAL_RESYNC_TIMEBASE
bl opal_call_realmode;

/* TODO: Check r3 for failure */

clear_lock:
andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS
lwsync
stw r15,0(r14)

common_exit:
li r5,PNV_THREAD_RUNNING
stb r5,PACA_THREAD_IDLE_STATE(r13)

mtspr SPRN_SRR1,r16
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
li r0,KVM_HWTHREAD_IN_KERNEL
stb r0,HSTATE_HWTHREAD_STATE(r13)
/* Order setting hwthread_state vs. testing hwthread_req */
sync
lbz r0,HSTATE_HWTHREAD_REQ(r13)
cmpwi r0,0
beq 6f
b kvm_start_guest
6:
#endif

REST_NVGPRS(r1)
REST_GPR(2, r1)
ld r3,_CCR(r1)
Expand All @@ -228,6 +330,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
mtspr SPRN_SRR0,r5
rfid

fastsleep_workaround_at_exit:
li r3,1
li r4,0
li r0,OPAL_CONFIG_CPU_IDLE_STATE
bl opal_call_realmode
b timebase_resync

/*
* R3 here contains the value that will be returned to the caller
* of power7_nap.
Expand Down
Loading

0 comments on commit 7cba160

Please sign in to comment.