From 85a4d2d41dc6d1c0296326204a857a9fab864a31 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 31 Jan 2013 14:40:49 -0500 Subject: [PATCH 01/22] intel_idle: support Haswell This patch enables intel_idle to run on the next-generation Intel(R) Microarchitecture code named "Haswell". Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b2cf489ba3e1d..fa714774b9609 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -212,6 +212,38 @@ static struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = { .enter = &intel_idle }, }; +static struct cpuidle_state hsw_cstates[MWAIT_MAX_NUM_CSTATES] = { + { /* MWAIT C0 */ }, + { /* MWAIT C1 */ + .name = "C1-HSW", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle }, + { /* MWAIT C2 */ + .name = "C3-HSW", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, + .target_residency = 100, + .enter = &intel_idle }, + { /* MWAIT C3 */ + .name = "C6-HSW", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 400, + .enter = &intel_idle }, + { /* MWAIT C4 */ + .name = "C7s-HSW", + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, + .target_residency = 500, + .enter = &intel_idle }, +}; + static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ @@ -365,6 +397,10 @@ static const struct idle_cpu idle_cpu_ivb = { .state_table = ivb_cstates, }; +static const struct idle_cpu idle_cpu_hsw = { + .state_table = hsw_cstates, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -382,6 +418,9 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x2d, idle_cpu_snb), ICPU(0x3a, idle_cpu_ivb), ICPU(0x3e, idle_cpu_ivb), + ICPU(0x3c, idle_cpu_hsw), + ICPU(0x3f, idle_cpu_hsw), + ICPU(0x45, idle_cpu_hsw), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); From 70b43400bc290764b49ff3497a9824604c66c409 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 8 Jan 2013 01:26:07 -0500 Subject: [PATCH 02/22] tools/power turbostat: support Haswell This patch enables turbostat to run properly on the next-generation Intel(R) Microarchitecture, code named "Haswell" (HSW). HSW supports the BCLK and counters found in SNB. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ce6d46038f742..b326878bd5d19 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1397,6 +1397,9 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) case 0x2D: /* SNB Xeon */ case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ return 1; case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ @@ -1488,6 +1491,9 @@ void rapl_probe(unsigned int family, unsigned int model) switch (model) { case 0x2A: case 0x3A: + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; break; case 0x2D: @@ -1724,6 +1730,9 @@ int is_snb(unsigned int family, unsigned int model) case 0x2D: case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ return 1; } return 0; @@ -2248,7 +2257,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.0 November 23, 2012" + fprintf(stderr, "turbostat v3.1 January 8, 2013" " - Len Brown \n"); turbostat_init(); From 679204183472af16e8e75d2b1479459ad19bc67c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 31 Jan 2013 15:22:15 -0500 Subject: [PATCH 03/22] tools/power turbostat: decode MSR_IA32_POWER_CTL When verbose is enabled, print the C1E-Enable bit in MSR_IA32_POWER_CTL. also delete some redundant tests on the verbose variable. Signed-off-by: Len Brown --- arch/x86/include/uapi/asm/msr-index.h | 2 ++ tools/power/x86/turbostat/turbostat.c | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 433a59fb1a741..7bdaf7c9b1e01 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -103,6 +103,8 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_IA32_POWER_CTL 0x000001fc + #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b326878bd5d19..75f64e05ec308 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -908,8 +908,7 @@ void print_verbose_header(void) get_msr(0, MSR_NHM_PLATFORM_INFO, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); ratio = (msr >> 40) & 0xFF; fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency\n", @@ -919,13 +918,16 @@ void print_verbose_header(void) fprintf(stderr, "%d * %.0f = %.0f MHz TSC frequency\n", ratio, bclk, ratio * bclk); + get_msr(0, MSR_IA32_POWER_CTL, &msr); + fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E: %sabled)\n", + msr, msr & 0x2 ? "EN" : "DIS"); + if (!do_ivt_turbo_ratio_limit) goto print_nhm_turbo_ratio_limits; get_msr(0, MSR_IVT_TURBO_RATIO_LIMIT, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1016,8 +1018,7 @@ void print_verbose_header(void) get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) From 137ecc779c80138723677209730738d76262e810 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Feb 2013 21:35:35 -0500 Subject: [PATCH 04/22] intel_idle: remove use and definition of MWAIT_MAX_NUM_CSTATES Cosmetic only. Replace use of MWAIT_MAX_NUM_CSTATES with CPUIDLE_STATE_MAX. They are both 8, so this patch has no functional change. The reason to change is that intel_idle will soon be able to export more than the 8 "major" states supported by MWAIT. When we hit that limit, it is important to know where the limit comes from. Signed-off-by: Len Brown --- arch/x86/include/asm/mwait.h | 1 - drivers/idle/intel_idle.c | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index bcdff997668c4..3f447320ce876 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -4,7 +4,6 @@ #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 -#define MWAIT_MAX_NUM_CSTATES 8 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fa714774b9609..c949a6f25a83c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -74,7 +74,7 @@ static struct cpuidle_driver intel_idle_driver = { .en_core_tk_irqen = 1, }; /* intel_idle.max_cstate=0 disables driver */ -static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; +static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int mwait_substates; @@ -123,7 +123,7 @@ static struct cpuidle_state *cpuidle_state_table; * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. */ -static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { +static struct cpuidle_state nehalem_cstates[CPUIDLE_STATE_MAX] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ .name = "C1-NHM", @@ -148,7 +148,7 @@ static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { .enter = &intel_idle }, }; -static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { +static struct cpuidle_state snb_cstates[CPUIDLE_STATE_MAX] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ .name = "C1-SNB", @@ -180,7 +180,7 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { .enter = &intel_idle }, }; -static struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = { +static struct cpuidle_state ivb_cstates[CPUIDLE_STATE_MAX] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ .name = "C1-IVB", @@ -212,7 +212,7 @@ static struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = { .enter = &intel_idle }, }; -static struct cpuidle_state hsw_cstates[MWAIT_MAX_NUM_CSTATES] = { +static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ .name = "C1-HSW", @@ -244,7 +244,7 @@ static struct cpuidle_state hsw_cstates[MWAIT_MAX_NUM_CSTATES] = { .enter = &intel_idle }, }; -static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { +static struct cpuidle_state atom_cstates[CPUIDLE_STATE_MAX] = { { /* MWAIT C0 */ }, { /* MWAIT C1 */ .name = "C1-ATM", @@ -503,7 +503,7 @@ static int intel_idle_cpuidle_driver_init(void) drv->state_count = 1; - for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { + for (cstate = 1; cstate < CPUIDLE_STATE_MAX; ++cstate) { int num_substates; if (cstate > max_cstate) { @@ -560,7 +560,7 @@ static int intel_idle_cpu_init(int cpu) dev->state_count = 1; - for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { + for (cstate = 1; cstate < CPUIDLE_STATE_MAX; ++cstate) { int num_substates; if (cstate > max_cstate) { From e022e7eb90f3edb83f9ff77825eda3d1b3a2f2e0 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Feb 2013 23:37:30 -0500 Subject: [PATCH 05/22] intel_idle: remove assumption of one C-state per MWAIT flag Remove the assumption that cstate_tables are indexed by MWAIT flag values. Each entry identifies itself via its own flags value. This change is needed to support multiple states that share the same MWAIT flags. Note that this can have an effect on what state is described by 'N' on cmdline intel_idle.max_cstate=N on some systems. intel_idle.max_cstate=0 still disables the driver intel_idle.max_cstate=1 still results in just C1(E) However, "place holders" in the sparse C-state name-space (eg. Atom) have been removed. Signed-off-by: Len Brown --- arch/x86/include/asm/mwait.h | 2 + drivers/idle/intel_idle.c | 110 +++++++++++++++++++---------------- 2 files changed, 61 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 3f447320ce876..2f366d0ac6b46 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -4,6 +4,8 @@ #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 +#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) +#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c949a6f25a83c..927cfb4d66f40 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -124,158 +124,161 @@ static struct cpuidle_state *cpuidle_state_table; * Thus C0 is a dummy. */ static struct cpuidle_state nehalem_cstates[CPUIDLE_STATE_MAX] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ + { .name = "C1-NHM", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 3, .target_residency = 6, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C3-NHM", .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 20, .target_residency = 80, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-NHM", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 200, .target_residency = 800, .enter = &intel_idle }, + { + .enter = NULL } }; static struct cpuidle_state snb_cstates[CPUIDLE_STATE_MAX] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ + { .name = "C1-SNB", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C3-SNB", .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 211, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-SNB", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 104, .target_residency = 345, .enter = &intel_idle }, - { /* MWAIT C4 */ + { .name = "C7-SNB", .desc = "MWAIT 0x30", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 109, .target_residency = 345, .enter = &intel_idle }, + { + .enter = NULL } }; static struct cpuidle_state ivb_cstates[CPUIDLE_STATE_MAX] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ + { .name = "C1-IVB", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C3-IVB", .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 59, .target_residency = 156, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-IVB", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 300, .enter = &intel_idle }, - { /* MWAIT C4 */ + { .name = "C7-IVB", .desc = "MWAIT 0x30", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 87, .target_residency = 300, .enter = &intel_idle }, + { + .enter = NULL } }; static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ + { .name = "C1-HSW", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C3-HSW", .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, .target_residency = 100, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-HSW", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, .target_residency = 400, .enter = &intel_idle }, - { /* MWAIT C4 */ + { .name = "C7s-HSW", .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, .target_residency = 500, .enter = &intel_idle }, + { + .enter = NULL } }; static struct cpuidle_state atom_cstates[CPUIDLE_STATE_MAX] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ + { .name = "C1-ATM", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 1, .target_residency = 4, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C2-ATM", .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 20, .target_residency = 80, .enter = &intel_idle }, - { /* MWAIT C3 */ }, - { /* MWAIT C4 */ + { .name = "C4-ATM", .desc = "MWAIT 0x30", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 100, .target_residency = 400, .enter = &intel_idle }, - { /* MWAIT C5 */ }, - { /* MWAIT C6 */ + { .name = "C6-ATM", .desc = "MWAIT 0x52", .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 140, .target_residency = 560, .enter = &intel_idle }, + { + .enter = NULL } }; /** @@ -503,32 +506,31 @@ static int intel_idle_cpuidle_driver_init(void) drv->state_count = 1; - for (cstate = 1; cstate < CPUIDLE_STATE_MAX; ++cstate) { - int num_substates; + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + int num_substates, mwait_hint, mwait_cstate, mwait_substate; - if (cstate > max_cstate) { + if (cpuidle_state_table[cstate].enter == NULL) + break; + + if (cstate + 1 > max_cstate) { printk(PREFIX "max_cstate %d reached\n", max_cstate); break; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint); + /* does the state exist in CPUID.MWAIT? */ - num_substates = (mwait_substates >> ((cstate) * 4)) + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) & MWAIT_SUBSTATE_MASK; - if (num_substates == 0) - continue; - /* is the state not enabled? */ - if (cpuidle_state_table[cstate].enter == NULL) { - /* does the driver not know about the state? */ - if (*cpuidle_state_table[cstate].name == '\0') - pr_debug(PREFIX "unaware of model 0x%x" - " MWAIT %d please" - " contact lenb@kernel.org\n", - boot_cpu_data.x86_model, cstate); + + /* if sub-state in table is not enumerated by CPUID */ + if ((mwait_substate + 1) > num_substates) continue; - } - if ((cstate > 2) && + if (((mwait_cstate + 1) > 2) && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halts in idle" " states deeper than C2"); @@ -560,21 +562,27 @@ static int intel_idle_cpu_init(int cpu) dev->state_count = 1; - for (cstate = 1; cstate < CPUIDLE_STATE_MAX; ++cstate) { - int num_substates; + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + int num_substates, mwait_hint, mwait_cstate, mwait_substate; - if (cstate > max_cstate) { + if (cpuidle_state_table[cstate].enter == NULL) + continue; + + if (cstate + 1 > max_cstate) { printk(PREFIX "max_cstate %d reached\n", max_cstate); break; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint); + /* does the state exist in CPUID.MWAIT? */ - num_substates = (mwait_substates >> ((cstate) * 4)) - & MWAIT_SUBSTATE_MASK; - if (num_substates == 0) - continue; - /* is the state not enabled? */ - if (cpuidle_state_table[cstate].enter == NULL) + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) + & MWAIT_SUBSTATE_MASK; + + /* if sub-state in table is not enumerated by CPUID */ + if ((mwait_substate + 1) > num_substates) continue; dev->state_count += 1; From 32e9518005c8dd9ed668f40f98632c8186df4909 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 2 Feb 2013 01:31:56 -0500 Subject: [PATCH 06/22] intel_idle: export both C1 and C1E Here we disable HW promotion of C1 to C1E and export both C1 and C1E and distinct C-states. This allows a cpuidle governor to choose a lower latency C-state than C1E when necessary to satisfy performance and QOS constraints -- and still save power versus polling. This also corrects the erroneous latency previously reported for C1E -- it is 10usec, not 1usec. Note that if you use "intel_idle.max_cstate=N", then you must increment N by 1 to get the same behavior after this change. Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 54 +++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 927cfb4d66f40..5d66750138647 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -90,6 +90,7 @@ struct idle_cpu { * Indicate which enable bits to clear here. */ unsigned long auto_demotion_disable_flags; + bool disable_promotion_to_c1e; }; static const struct idle_cpu *icpu; @@ -131,6 +132,13 @@ static struct cpuidle_state nehalem_cstates[CPUIDLE_STATE_MAX] = { .exit_latency = 3, .target_residency = 6, .enter = &intel_idle }, + { + .name = "C1E-NHM", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, { .name = "C3-NHM", .desc = "MWAIT 0x10", @@ -154,8 +162,15 @@ static struct cpuidle_state snb_cstates[CPUIDLE_STATE_MAX] = { .name = "C1-SNB", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 1, - .target_residency = 1, + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle }, + { + .name = "C1E-SNB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, .enter = &intel_idle }, { .name = "C3-SNB", @@ -190,6 +205,13 @@ static struct cpuidle_state ivb_cstates[CPUIDLE_STATE_MAX] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle }, + { + .name = "C1E-IVB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, { .name = "C3-IVB", .desc = "MWAIT 0x10", @@ -223,6 +245,13 @@ static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle }, + { + .name = "C1E-HSW", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, { .name = "C3-HSW", .desc = "MWAIT 0x10", @@ -250,11 +279,11 @@ static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = { static struct cpuidle_state atom_cstates[CPUIDLE_STATE_MAX] = { { - .name = "C1-ATM", + .name = "C1E-ATM", .desc = "MWAIT 0x00", .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 1, - .target_residency = 4, + .exit_latency = 10, + .target_residency = 20, .enter = &intel_idle }, { .name = "C2-ATM", @@ -377,10 +406,19 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } +static void c1e_promotion_disable(void *dummy) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits &= ~0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} static const struct idle_cpu idle_cpu_nehalem = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, + .disable_promotion_to_c1e = true, }; static const struct idle_cpu idle_cpu_atom = { @@ -394,14 +432,17 @@ static const struct idle_cpu idle_cpu_lincroft = { static const struct idle_cpu idle_cpu_snb = { .state_table = snb_cstates, + .disable_promotion_to_c1e = true, }; static const struct idle_cpu idle_cpu_ivb = { .state_table = ivb_cstates, + .disable_promotion_to_c1e = true, }; static const struct idle_cpu idle_cpu_hsw = { .state_table = hsw_cstates, + .disable_promotion_to_c1e = true, }; #define ICPU(model, cpu) \ @@ -544,6 +585,9 @@ static int intel_idle_cpuidle_driver_init(void) if (icpu->auto_demotion_disable_flags) on_each_cpu(auto_demotion_disable, NULL, 1); + if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ + on_each_cpu(c1e_promotion_disable, NULL, 1); + return 0; } From 1ed51011af7450991780f9a7fd916554be19d2a3 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 10 Feb 2013 17:19:24 -0500 Subject: [PATCH 07/22] tools/power turbostat: display SMI count by default The SMI counter is popular -- so display it by default rather than requiring an option. What the heck, we've blown the 80 column budget on many systems already... Note that the value displayed is the delta during the measurement interval. The absolute value of the counter can still be seen with the generic 32-bit MSR option, ie. -m 0x34 Signed-off-by: Len Brown --- arch/x86/include/uapi/asm/msr-index.h | 1 + tools/power/x86/turbostat/turbostat.8 | 36 ++++++++++++--------------- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++--- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 7bdaf7c9b1e01..8d013f5153bc8 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -274,6 +274,7 @@ #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_EBC_FREQUENCY_ID 0x0000002c +#define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d7dc2cfefb57..b4ddb748356cd 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -31,8 +31,6 @@ The \fB-S\fP option limits output to a 1-line System Summary for each interval. .PP The \fB-v\fP option increases verbosity. .PP -The \fB-s\fP option prints the SMI counter, equivalent to "-c 0x34" -.PP The \fB-c MSR#\fP option includes the delta of the specified 32-bit MSR counter. .PP The \fB-C MSR#\fP option includes the delta of the specified 64-bit MSR counter. @@ -186,26 +184,24 @@ This is a weighted average, where the weight is %c0. ie. it is the total number un-halted cycles elapsed per time divided by the number of CPUs. .SH SMI COUNTING EXAMPLE On Intel Nehalem and newer processors, MSR 0x34 is a System Management Mode Interrupt (SMI) counter. -Using the -m option, you can display how many SMIs have fired since reset, or if there -are SMIs during the measurement interval, you can display the delta using the -d option. +This counter is shown by default under the "SMI" column. .nf -[root@x980 ~]# turbostat -m 0x34 -cor CPU %c0 GHz TSC MSR 0x034 %c1 %c3 %c6 %pc3 %pc6 - 1.41 1.82 3.38 0x00000000 8.92 37.82 51.85 17.37 0.55 - 0 0 3.73 2.03 3.38 0x00000055 1.72 48.25 46.31 17.38 0.55 - 0 6 0.14 1.63 3.38 0x00000056 5.30 - 1 2 2.51 1.80 3.38 0x00000056 15.65 29.33 52.52 - 1 8 0.10 1.65 3.38 0x00000056 18.05 - 2 4 1.16 1.68 3.38 0x00000056 5.87 24.47 68.50 - 2 10 0.10 1.63 3.38 0x00000056 6.93 - 8 1 3.84 1.91 3.38 0x00000056 1.36 50.65 44.16 - 8 7 0.08 1.64 3.38 0x00000056 5.12 - 9 3 1.82 1.73 3.38 0x00000056 7.59 24.21 66.38 - 9 9 0.09 1.68 3.38 0x00000056 9.32 - 10 5 1.66 1.65 3.38 0x00000056 15.10 50.00 33.23 - 10 11 1.72 1.65 3.38 0x00000056 15.05 +[root@x980 ~]# turbostat +cor CPU %c0 GHz TSC SMI %c1 %c3 %c6 CTMP %pc3 %pc6 + 0.11 1.91 3.38 0 1.84 0.26 97.79 29 0.82 83.87 + 0 0 0.40 1.63 3.38 0 10.27 0.12 89.20 20 0.82 83.88 + 0 6 0.06 1.63 3.38 0 10.61 + 1 2 0.37 2.63 3.38 0 0.02 0.10 99.51 22 + 1 8 0.01 1.62 3.38 0 0.39 + 2 4 0.07 1.62 3.38 0 0.04 0.07 99.82 23 + 2 10 0.02 1.62 3.38 0 0.09 + 8 1 0.23 1.64 3.38 0 0.10 1.07 98.60 24 + 8 7 0.02 1.64 3.38 0 0.31 + 9 3 0.03 1.62 3.38 0 0.03 0.05 99.89 29 + 9 9 0.02 1.62 3.38 0 0.05 + 10 5 0.07 1.62 3.38 0 0.08 0.12 99.73 27 + 10 11 0.03 1.62 3.38 0 0.13 ^C -[root@x980 ~]# .fi .SH NOTES diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 75f64e05ec308..6f3214ed4444e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -58,6 +58,7 @@ unsigned int extra_msr_offset32; unsigned int extra_msr_offset64; unsigned int extra_delta_offset32; unsigned int extra_delta_offset64; +int do_smi; double bclk; unsigned int show_pkg; unsigned int show_core; @@ -99,6 +100,7 @@ struct thread_data { unsigned long long extra_delta64; unsigned long long extra_msr32; unsigned long long extra_delta32; + unsigned int smi_count; unsigned int cpu_id; unsigned int flags; #define CPU_IS_FIRST_THREAD_IN_CORE 0x2 @@ -248,6 +250,8 @@ void print_header(void) if (has_aperf) outp += sprintf(outp, " GHz"); outp += sprintf(outp, " TSC"); + if (do_smi) + outp += sprintf(outp, " SMI"); if (extra_delta_offset32) outp += sprintf(outp, " count 0x%03X", extra_delta_offset32); if (extra_delta_offset64) @@ -314,6 +318,8 @@ int dump_counters(struct thread_data *t, struct core_data *c, extra_msr_offset32, t->extra_msr32); fprintf(stderr, "msr0x%x: %016llX\n", extra_msr_offset64, t->extra_msr64); + if (do_smi) + fprintf(stderr, "SMI: %08X\n", t->smi_count); } if (c) { @@ -352,6 +358,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, * RAM_W: %5.2 * GHz: "GHz" 3 columns %3.2 * TSC: "TSC" 3 columns %3.2 + * SMI: "SMI" 4 columns %4d * percentage " %pc3" %6.2 * Perf Status percentage: %5.2 * "CTMP" 4 columns %4d @@ -431,6 +438,10 @@ int format_counters(struct thread_data *t, struct core_data *c, /* TSC */ outp += sprintf(outp, "%5.2f", 1.0 * t->tsc/units/interval_float); + /* SMI */ + if (do_smi) + outp += sprintf(outp, "%4d", t->smi_count); + /* delta */ if (extra_delta_offset32) outp += sprintf(outp, " %11llu", t->extra_delta32); @@ -645,6 +656,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, */ old->extra_msr32 = new->extra_msr32; old->extra_msr64 = new->extra_msr64; + + if (do_smi) + old->smi_count = new->smi_count - old->smi_count; } int delta_cpu(struct thread_data *t, struct core_data *c, @@ -672,6 +686,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->mperf = 0; t->c1 = 0; + t->smi_count = 0; t->extra_delta32 = 0; t->extra_delta64 = 0; @@ -802,6 +817,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -4; } + if (do_smi) { + if (get_msr(cpu, MSR_SMI_COUNT, &msr)) + return -5; + t->smi_count = msr & 0xFFFFFFFF; + } if (extra_delta_offset32) { if (get_msr(cpu, extra_delta_offset32, &msr)) return -5; @@ -1893,6 +1913,7 @@ void check_cpuid() do_nehalem_platform_info = genuine_intel && has_invariant_tsc; do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ + do_smi = do_nhm_cstates; do_snb_cstates = is_snb(family, model); bclk = discover_bclk(family, model); @@ -2229,9 +2250,6 @@ void cmdline(int argc, char **argv) case 'c': sscanf(optarg, "%x", &extra_delta_offset32); break; - case 's': - extra_delta_offset32 = 0x34; /* SMI counter */ - break; case 'C': sscanf(optarg, "%x", &extra_delta_offset64); break; @@ -2258,7 +2276,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.1 January 8, 2013" + fprintf(stderr, "turbostat v3.2 February 11, 2013" " - Len Brown \n"); turbostat_init(); From dd8af076262cc1ff85a8d5e0c5b1a4716d19fe25 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 21:10:04 -0500 Subject: [PATCH 08/22] APM idle: register apm_cpu_idle via cpuidle Update APM to register its local idle routine with cpuidle. This allows us to stop exporting pm_idle to modules on x86. The Kconfig sub-option, APM_CPU_IDLE, now depends on on CPU_IDLE. Compile-tested only. Signed-off-by: Len Brown Reviewed-by: Daniel Lezcano Cc: Jiri Kosina --- arch/x86/Kconfig | 1 + arch/x86/kernel/apm_32.c | 57 +++++++++++++++++++++++---------------- arch/x86/kernel/process.c | 3 --- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 225543bf45a5c..1b635861401c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1912,6 +1912,7 @@ config APM_DO_ENABLE this feature. config APM_CPU_IDLE + depends on CPU_IDLE bool "Make CPU Idle calls when idle" ---help--- Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index d65464e435034..9f4bc6a1164dd 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -232,6 +232,7 @@ #include #include #include +#include #include #include @@ -360,13 +361,35 @@ struct apm_user { * idle percentage above which bios idle calls are done */ #ifdef CONFIG_APM_CPU_IDLE -#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012 #define DEFAULT_IDLE_THRESHOLD 95 #else #define DEFAULT_IDLE_THRESHOLD 100 #endif #define DEFAULT_IDLE_PERIOD (100 / 3) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .en_core_tk_irqen = 1, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + /* * Local variables */ @@ -377,7 +400,6 @@ static struct { static int clock_slowed; static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; static int suspends_pending; static int standbys_pending; static int ignore_sys_suspend; @@ -884,8 +906,6 @@ static void apm_do_busy(void) #define IDLE_CALC_LIMIT (HZ * 100) #define IDLE_LEAKY_MAX 16 -static void (*original_pm_idle)(void) __read_mostly; - /** * apm_cpu_idle - cpu idling for APM capable Linux * @@ -894,7 +914,8 @@ static void (*original_pm_idle)(void) __read_mostly; * Furthermore it calls the system default idle routine. */ -static void apm_cpu_idle(void) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ @@ -904,7 +925,6 @@ static void apm_cpu_idle(void) unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; - WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012"); recalc: if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; @@ -950,10 +970,7 @@ static void apm_cpu_idle(void) break; } } - if (original_pm_idle) - original_pm_idle(); - else - default_idle(); + default_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) @@ -963,7 +980,7 @@ static void apm_cpu_idle(void) if (apm_idle_done) apm_do_busy(); - local_irq_enable(); + return index; } /** @@ -2381,9 +2398,9 @@ static int __init apm_init(void) if (HZ != 100) idle_period = (idle_period * HZ) / 100; if (idle_threshold < 100) { - original_pm_idle = pm_idle; - pm_idle = apm_cpu_idle; - set_pm_idle = 1; + if (!cpuidle_register_driver(&apm_idle_driver)) + if (cpuidle_register_device(&apm_cpuidle_device)) + cpuidle_unregister_driver(&apm_idle_driver); } return 0; @@ -2393,15 +2410,9 @@ static void __exit apm_exit(void) { int error; - if (set_pm_idle) { - pm_idle = original_pm_idle; - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - kick_all_cpus_sync(); - } + cpuidle_unregister_device(&apm_cpuidle_device); + cpuidle_unregister_driver(&apm_idle_driver); + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { error = apm_engage_power_management(APM_DEVICE_ALL, 0); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2ed787f15bf0c..f571a6e08710f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -272,9 +272,6 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. */ void (*pm_idle)(void); -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(pm_idle); -#endif #ifndef CONFIG_SMP static inline void play_dead(void) From a476bda30baf7efa7f305793a340aae07b6e5780 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 21:45:03 -0500 Subject: [PATCH 09/22] x86 idle: rename global pm_idle to static x86_idle (pm_idle)() is being removed from linux/pm.h because Linux does not have such a cross-architecture concept. x86 uses an idle function pointer in its architecture specific code as a backup to cpuidle. So we re-name x86 use of pm_idle to x86_idle, and make it static to x86. Signed-off-by: Len Brown Cc: x86@kernel.org --- arch/x86/kernel/process.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f571a6e08710f..ceb05db59be12 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -268,10 +268,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); +static void (*x86_idle)(void); #ifndef CONFIG_SMP static inline void play_dead(void) @@ -348,7 +345,7 @@ void cpu_idle(void) rcu_idle_enter(); if (cpuidle_idle_call()) - pm_idle(); + x86_idle(); rcu_idle_exit(); start_critical_timings(); @@ -395,9 +392,9 @@ EXPORT_SYMBOL(default_idle); bool set_pm_idle_to_default(void) { - bool ret = !!pm_idle; + bool ret = !!x86_idle; - pm_idle = default_idle; + x86_idle = default_idle; return ret; } @@ -564,11 +561,10 @@ static void amd_e400_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - if (pm_idle == poll_idle && smp_num_siblings > 1) { + if (x86_idle == poll_idle && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); - } #endif - if (pm_idle) + if (x86_idle) return; if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { @@ -576,19 +572,19 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) * One CPU supports mwait => All CPUs supports mwait */ pr_info("using mwait in idle threads\n"); - pm_idle = mwait_idle; + x86_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); - pm_idle = amd_e400_idle; + x86_idle = amd_e400_idle; } else - pm_idle = default_idle; + x86_idle = default_idle; } void __init init_amd_e400_c1e_mask(void) { /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (pm_idle == amd_e400_idle) + if (x86_idle == amd_e400_idle) zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); } @@ -599,7 +595,7 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); - pm_idle = poll_idle; + x86_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { boot_option_idle_override = IDLE_FORCE_MWAIT; @@ -612,7 +608,7 @@ static int __init idle_setup(char *str) * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ - pm_idle = default_idle; + x86_idle = default_idle; boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* From 3738fa5bbc3acaf6a807cb45f4367cbab412faee Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:52:57 -0500 Subject: [PATCH 10/22] sh idle: rename global pm_idle to static sh_idle SH idle code could use some simplification. This patch enables that by guaranteeing that "sh_idle" is local, and thus architecture specific. Signed-off-by: Len Brown Cc: linux-sh@vger.kernel.org --- arch/sh/kernel/idle.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 0c910163caa36..3d5a1b387cc0f 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -22,7 +22,7 @@ #include #include -void (*pm_idle)(void); +static void (*sh_idle)(void); static int hlt_counter; @@ -103,9 +103,9 @@ void cpu_idle(void) /* Don't trace irqs off for idle */ stop_critical_timings(); if (cpuidle_idle_call()) - pm_idle(); + sh_idle(); /* - * Sanity check to ensure that pm_idle() returns + * Sanity check to ensure that sh_idle() returns * with IRQs enabled */ WARN_ON(irqs_disabled()); @@ -123,13 +123,13 @@ void __init select_idle_routine(void) /* * If a platform has set its own idle routine, leave it alone. */ - if (pm_idle) + if (sh_idle) return; if (hlt_works()) - pm_idle = default_idle; + sh_idle = default_idle; else - pm_idle = poll_idle; + sh_idle = poll_idle; } void stop_this_cpu(void *unused) From d472ba840d622e48f5c94369263ef3af18ab64f8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 23:27:26 -0500 Subject: [PATCH 11/22] sparc idle: rename pm_idle to sparc_idle (pm_idle)() is being removed from linux/pm.h because Linux does not have such a cross-architecture concept. sparc uses an idle function pointer in its architecture specific code. So we re-name sparc use of pm_idle to sparc_idle. Signed-off-by: Len Brown Acked-by: David S. Miller Acked-by: Sam Ravnborg --- arch/sparc/include/asm/processor_32.h | 1 + arch/sparc/kernel/apc.c | 3 ++- arch/sparc/kernel/leon_pmc.c | 5 +++-- arch/sparc/kernel/pmc.c | 3 ++- arch/sparc/kernel/process_32.c | 7 +++---- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index c1e01914fd98b..2c7baa4c45050 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -118,6 +118,7 @@ extern unsigned long get_wchan(struct task_struct *); extern struct task_struct *last_task_used_math; #define cpu_relax() barrier() +extern void (*sparc_idle)(void); #endif diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index 348fa1aeabce5..eefda32b595ef 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -20,6 +20,7 @@ #include #include #include +#include /* Debugging * @@ -158,7 +159,7 @@ static int apc_probe(struct platform_device *op) /* Assign power management IDLE handler */ if (!apc_no_idle) - pm_idle = apc_swift_idle; + sparc_idle = apc_swift_idle; printk(KERN_INFO "%s: power management initialized%s\n", APC_DEVNAME, apc_no_idle ? " (CPU idle disabled)" : ""); diff --git a/arch/sparc/kernel/leon_pmc.c b/arch/sparc/kernel/leon_pmc.c index 4e174321097d7..708bca435219f 100644 --- a/arch/sparc/kernel/leon_pmc.c +++ b/arch/sparc/kernel/leon_pmc.c @@ -9,6 +9,7 @@ #include #include #include +#include /* List of Systems that need fixup instructions around power-down instruction */ unsigned int pmc_leon_fixup_ids[] = { @@ -69,9 +70,9 @@ static int __init leon_pmc_install(void) if (sparc_cpu_model == sparc_leon) { /* Assign power management IDLE handler */ if (pmc_leon_need_fixup()) - pm_idle = pmc_leon_idle_fixup; + sparc_idle = pmc_leon_idle_fixup; else - pm_idle = pmc_leon_idle; + sparc_idle = pmc_leon_idle; printk(KERN_INFO "leon: power management initialized\n"); } diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index dcbb62f63068b..8b7297faca79b 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -17,6 +17,7 @@ #include #include #include +#include /* Debug * @@ -63,7 +64,7 @@ static int pmc_probe(struct platform_device *op) #ifndef PMC_NO_IDLE /* Assign power management IDLE handler */ - pm_idle = pmc_swift_idle; + sparc_idle = pmc_swift_idle; #endif printk(KERN_INFO "%s: power management initialized\n", PMC_DEVNAME); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index be8e862badaff..62eede13831a2 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -43,8 +43,7 @@ * Power management idle function * Set in pm platform drivers (apc.c and pmc.c) */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); +void (*sparc_idle)(void); /* * Power-off handler instantiation for pm.h compliance @@ -75,8 +74,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ for (;;) { while (!need_resched()) { - if (pm_idle) - (*pm_idle)(); + if (sparc_idle) + (*sparc_idle)(); else cpu_relax(); } From 26bab0c976c5984e313d17d54997b74641f32544 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:17:47 -0500 Subject: [PATCH 12/22] blackfin idle: delete pm_idle pm_idle is dead code on blackfin. Signed-off-by: Len Brown Cc: uclinux-dist-devel@blackfin.uclinux.org --- arch/blackfin/kernel/process.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 3e16ad9b0a99a..8061426b7df5a 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -39,12 +39,6 @@ int nr_l1stack_tasks; void *l1_stack_base; unsigned long l1_stack_len; -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void) = NULL; -EXPORT_SYMBOL(pm_idle); - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); @@ -81,7 +75,6 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - void (*idle)(void) = pm_idle; #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(smp_processor_id())) From b0ea11497c8cbc5a66019814efa1733e47e47ab1 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:22:38 -0500 Subject: [PATCH 13/22] ARM idle: delete pm_idle pm_idle() on ARM was a synonym for default_idle(), so simply invoke default_idle() directly. Signed-off-by: Len Brown Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman --- arch/arm/kernel/process.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c6dec5fc20aa4..047d3e40e4706 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -172,14 +172,9 @@ static void default_idle(void) local_irq_enable(); } -void (*pm_idle)(void) = default_idle; -EXPORT_SYMBOL(pm_idle); - /* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. The only difference - * is that we always respect 'hlt_counter' to prevent low power idle. + * The idle thread. + * We always respect 'hlt_counter' to prevent low power idle. */ void cpu_idle(void) { @@ -210,10 +205,10 @@ void cpu_idle(void) } else if (!need_resched()) { stop_critical_timings(); if (cpuidle_idle_call()) - pm_idle(); + default_idle(); start_critical_timings(); /* - * pm_idle functions must always + * default_idle functions must always * return with IRQs enabled. */ WARN_ON(irqs_disabled()); From dc883ca34a4e8a309d652c86daab6f1b4edd9d4b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 23:15:13 -0500 Subject: [PATCH 14/22] ARM64 idle: delete pm_idle pm_idle() on arm64 was a synonym for default_idle(), so remove it and invoke default_idle() directly. Signed-off-by: Len Brown Acked-by: Catalin Marinas --- arch/arm64/kernel/process.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index cb0956bc96edf..c7002d40a9b01 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -97,14 +97,9 @@ static void default_idle(void) local_irq_enable(); } -void (*pm_idle)(void) = default_idle; -EXPORT_SYMBOL_GPL(pm_idle); - /* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. The only difference - * is that we always respect 'hlt_counter' to prevent low power idle. + * The idle thread. + * We always respect 'hlt_counter' to prevent low power idle. */ void cpu_idle(void) { @@ -122,10 +117,10 @@ void cpu_idle(void) local_irq_disable(); if (!need_resched()) { stop_critical_timings(); - pm_idle(); + default_idle(); start_critical_timings(); /* - * pm_idle functions should always return + * default_idle functions should always return * with IRQs enabled. */ WARN_ON(irqs_disabled()); From 28f14185c818c3b6a02b4e0dc00d9aca100ce024 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:34:18 -0500 Subject: [PATCH 15/22] cris idle: delete idle and pm_idle pm_idle() and idle() served no purpose on cris -- invoke default_idle() directly. Signed-off-by: Len Brown Acked-by: Jesper Nilsson --- arch/cris/kernel/process.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 7f65be6f7f177..104ff4dd9b98d 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -54,11 +54,6 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); -/* - * The following aren't currently used. - */ -void (*pm_idle)(void); - extern void default_idle(void); void (*pm_power_off)(void); @@ -77,16 +72,12 @@ void cpu_idle (void) while (1) { rcu_idle_enter(); while (!need_resched()) { - void (*idle)(void); /* * Mark this as an RCU critical section so that * synchronize_kernel() in the unload path waits * for our completion. */ - idle = pm_idle; - if (!idle) - idle = default_idle; - idle(); + default_idle(); } rcu_idle_exit(); schedule_preempt_disabled(); From 3e7fc708eb41f6385cf5cf64a68417a4be822be8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:28:34 -0500 Subject: [PATCH 16/22] ia64 idle: delete pm_idle pm_idle() on ia64 was a synonym for default_idle(). So simply invoke default_idle() directly. Signed-off-by: Len Brown Cc: linux-ia64@vger.kernel.org --- arch/ia64/kernel/process.c | 3 --- arch/ia64/kernel/setup.c | 1 - 2 files changed, 4 deletions(-) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 31360cbbd5f89..e34f565f595ae 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -57,8 +57,6 @@ void (*ia64_mark_idle)(int); unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -void (*pm_idle) (void); -EXPORT_SYMBOL(pm_idle); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); @@ -301,7 +299,6 @@ cpu_idle (void) if (mark_idle) (*mark_idle)(1); - idle = pm_idle; if (!idle) idle = default_idle; (*idle)(); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index aaefd9b94f2fc..2029cc0d2fc6f 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1051,7 +1051,6 @@ cpu_init (void) max_num_phys_stacked = num_phys_stacked; } platform_cpu_init(); - pm_idle = default_idle; } void __init From 1d8225661e2407241dbf16143180ff673668d6d7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:42:39 -0500 Subject: [PATCH 17/22] m32r idle: delete pm_idle, and other dead idle code All paths on m32r lead to cpu_relax(). So delete the dead code and simply call cpu_relax() directly. Signed-off-by: Len Brown Cc: linux-m32r@ml.linux-m32r.org --- arch/m32r/kernel/process.c | 51 ++------------------------------------ 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 765d0f57c787e..bde899e155d3a 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -44,35 +44,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return tsk->thread.lr; } -/* - * Powermanagement idle function, if any.. - */ -static void (*pm_idle)(void) = NULL; - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); -/* - * We use this is we don't have any better - * idle routine.. - */ -static void default_idle(void) -{ - /* M32R_FIXME: Please use "cpu_sleep" mode. */ - cpu_relax(); -} - -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->work.need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle (void) -{ - /* M32R_FIXME */ - cpu_relax(); -} - /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a @@ -84,14 +58,8 @@ void cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { rcu_idle_enter(); - while (!need_resched()) { - void (*idle)(void) = pm_idle; - - if (!idle) - idle = default_idle; - - idle(); - } + while (!need_resched()) + cpu_relax(); rcu_idle_exit(); schedule_preempt_disabled(); } @@ -120,21 +88,6 @@ void machine_power_off(void) /* M32R_FIXME */ } -static int __init idle_setup (char *str) -{ - if (!strncmp(str, "poll", 4)) { - printk("using poll in idle threads.\n"); - pm_idle = poll_idle; - } else if (!strncmp(str, "sleep", 4)) { - printk("using sleep in idle threads.\n"); - pm_idle = default_idle; - } - - return 1; -} - -__setup("idle=", idle_setup); - void show_regs(struct pt_regs * regs) { printk("\n"); From def82035a81dd1f46b88cd2cd3f2f8a75b3d6b6d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:31:57 -0500 Subject: [PATCH 18/22] microblaze idle: delete pm_idle pm_idle on microblaze served no purpose. Signed-off-by: Len Brown Cc: microblaze-uclinux@itee.uq.edu.au --- arch/microblaze/kernel/process.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a5b74f729e5ba..6ff2dcff3410a 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -41,7 +41,6 @@ void show_regs(struct pt_regs *regs) regs->msr, regs->ear, regs->esr, regs->fsr); } -void (*pm_idle)(void); void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); @@ -98,8 +97,6 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - void (*idle)(void) = pm_idle; - if (!idle) idle = default_idle; From 7626f93316aeb05575a8f46106c5507f8604fd74 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:30:27 -0500 Subject: [PATCH 19/22] mn10300 idle: delete pm_idle pm_idle on mn10300 served no purpose. Signed-off-by: Len Brown Cc: linux-am33-list@redhat.com --- arch/mn10300/kernel/process.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index eb09f5a552ff3..84f4e97e30745 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -36,12 +36,6 @@ #include #include "internal.h" -/* - * power management idle function, if any.. - */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); - /* * return saved PC of a blocked thread. */ @@ -113,7 +107,6 @@ void cpu_idle(void) void (*idle)(void); smp_rmb(); - idle = pm_idle; if (!idle) { #if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) idle = poll_idle; From ed9831407bcc73abd6cb80549d5d028a8809a488 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:26:31 -0500 Subject: [PATCH 20/22] openrisc idle: delete pm_idle pm_idle() on openrisc was dead code. Signed-off-by: Len Brown Cc: linux@lists.openrisc.net --- arch/openrisc/kernel/idle.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index 7d618feb1b72e..5e8a3b6d6bc69 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -39,11 +39,6 @@ void (*powersave) (void) = NULL; -static inline void pm_idle(void) -{ - barrier(); -} - void cpu_idle(void) { set_thread_flag(TIF_POLLING_NRFLAG); From 6ae236c05d46f48ac3d002a8643cda422f6e79dd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 22:47:22 -0500 Subject: [PATCH 21/22] unicore32 idle: delete stray pm_idle comment as pm_idle() has already been deleted from this code, the comment was a stray. Signed-off-by: Len Brown Cc: Guan Xuetao --- arch/unicore32/kernel/process.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 62bad9fed03e0..872d7e22d847e 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -45,11 +45,6 @@ static const char * const processor_modes[] = { "UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR" }; -/* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. - */ void cpu_idle(void) { /* endless idle loop with no priority at all */ From 558bd3e8dc7a798c5c845f90cf038b9bbd2df2b8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 9 Feb 2013 21:51:27 -0500 Subject: [PATCH 22/22] PM idle: remove global declaration of pm_idle pm_idle appears in no generic Linux code, it appears only in architecture-specific code. Thus, pm_idle should not be declared in pm.h. Architectures that use an idle function pointer should delcare one local to their architecture, and/or use cpuidle. Signed-off-by: Len Brown Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Cc: linux-pm@vger.kernel.org --- include/linux/pm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/pm.h b/include/linux/pm.h index 03d7bb145311e..97bcf23e045a2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -31,7 +31,6 @@ /* * Callbacks for platform drivers to implement. */ -extern void (*pm_idle)(void); extern void (*pm_power_off)(void); extern void (*pm_power_off_prepare)(void);