diff --git a/[refs] b/[refs]
index 0ec8014a4fa6..b9d52aecd8e7 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 2e7d0f60d8b976c951621c1bc82acf0654089c0b
+refs/heads/master: 6a377ddc4e4ede2eeb9cd46ada23bbe417704fc9
diff --git a/trunk/arch/arm/kernel/process.c b/trunk/arch/arm/kernel/process.c
index 047d3e40e470..c6dec5fc20aa 100644
--- a/trunk/arch/arm/kernel/process.c
+++ b/trunk/arch/arm/kernel/process.c
@@ -172,9 +172,14 @@ static void default_idle(void)
 	local_irq_enable();
 }
 
+void (*pm_idle)(void) = default_idle;
+EXPORT_SYMBOL(pm_idle);
+
 /*
- * The idle thread.
- * We always respect 'hlt_counter' to prevent low power idle.
+ * The idle thread, has rather strange semantics for calling pm_idle,
+ * but this is what x86 does and we need to do the same, so that
+ * things like cpuidle get called in the same way. The only difference
+ * is that we always respect 'hlt_counter' to prevent low power idle.
 */
 void cpu_idle(void)
 {
@@ -205,10 +210,10 @@ void cpu_idle(void)
 			} else if (!need_resched()) {
 				stop_critical_timings();
 				if (cpuidle_idle_call())
-					default_idle();
+					pm_idle();
 				start_critical_timings();
 				/*
-				 * default_idle functions must always
+				 * pm_idle functions must always
 				 * return with IRQs enabled.
 				 */
 				WARN_ON(irqs_disabled());
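The two hunks above replace ARM's hard-wired default_idle() call with an overridable pm_idle function pointer. A minimal user-space sketch of the hook-with-fallback pattern (a sketch only; pm_idle_hook and the demo functions are illustrative names, not kernel code):

    #include <stdio.h>

    static void default_idle(void) { puts("default_idle"); }  /* safe fallback */

    static void (*pm_idle_hook)(void);  /* stand-in for the kernel's pm_idle */

    static void idle_once(void)
    {
        /* snapshot the pointer once, so a concurrent writer cannot
         * change it between the NULL check and the call */
        void (*idle)(void) = pm_idle_hook;

        if (!idle)
            idle = default_idle;
        idle();
    }

    static void platform_idle(void) { puts("platform_idle"); }

    int main(void)
    {
        idle_once();                    /* -> default_idle */
        pm_idle_hook = platform_idle;   /* a driver overrides the hook */
        idle_once();                    /* -> platform_idle */
        return 0;
    }
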
diff --git a/trunk/arch/arm/mach-davinci/cpuidle.c b/trunk/arch/arm/mach-davinci/cpuidle.c
index 5ac9e9384b15..9107691adbdb 100644
--- a/trunk/arch/arm/mach-davinci/cpuidle.c
+++ b/trunk/arch/arm/mach-davinci/cpuidle.c
@@ -25,44 +25,35 @@
 
 #define DAVINCI_CPUIDLE_MAX_STATES	2
 
-static DEFINE_PER_CPU(struct cpuidle_device, davinci_cpuidle_device);
-static void __iomem *ddr2_reg_base;
-static bool ddr2_pdown;
-
-static void davinci_save_ddr_power(int enter, bool pdown)
-{
-	u32 val;
-
-	val = __raw_readl(ddr2_reg_base + DDR2_SDRCR_OFFSET);
-
-	if (enter) {
-		if (pdown)
-			val |= DDR2_SRPD_BIT;
-		else
-			val &= ~DDR2_SRPD_BIT;
-		val |= DDR2_LPMODEN_BIT;
-	} else {
-		val &= ~(DDR2_SRPD_BIT | DDR2_LPMODEN_BIT);
-	}
-
-	__raw_writel(val, ddr2_reg_base + DDR2_SDRCR_OFFSET);
-}
+struct davinci_ops {
+	void (*enter) (u32 flags);
+	void (*exit) (u32 flags);
+	u32 flags;
+};
 
 /* Actual code that puts the SoC in different idle states */
 static int davinci_enter_idle(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv, int index)
 {
-	davinci_save_ddr_power(1, ddr2_pdown);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct davinci_ops *ops = cpuidle_get_statedata(state_usage);
+
+	if (ops && ops->enter)
+		ops->enter(ops->flags);
 
 	index = cpuidle_wrap_enter(dev, drv, index,
 				arm_cpuidle_simple_enter);
 
-	davinci_save_ddr_power(0, ddr2_pdown);
+	if (ops && ops->exit)
+		ops->exit(ops->flags);
 
 	return index;
 }
 
+/* fields in davinci_ops.flags */
+#define DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN	BIT(0)
+
 static struct cpuidle_driver davinci_idle_driver = {
 	.name			= "cpuidle-davinci",
 	.owner			= THIS_MODULE,
@@ -79,6 +70,45 @@ static struct cpuidle_driver davinci_idle_driver = {
 	.state_count = DAVINCI_CPUIDLE_MAX_STATES,
 };
 
+static DEFINE_PER_CPU(struct cpuidle_device, davinci_cpuidle_device);
+static void __iomem *ddr2_reg_base;
+
+static void davinci_save_ddr_power(int enter, bool pdown)
+{
+	u32 val;
+
+	val = __raw_readl(ddr2_reg_base + DDR2_SDRCR_OFFSET);
+
+	if (enter) {
+		if (pdown)
+			val |= DDR2_SRPD_BIT;
+		else
+			val &= ~DDR2_SRPD_BIT;
+		val |= DDR2_LPMODEN_BIT;
+	} else {
+		val &= ~(DDR2_SRPD_BIT | DDR2_LPMODEN_BIT);
+	}
+
+	__raw_writel(val, ddr2_reg_base + DDR2_SDRCR_OFFSET);
+}
+
+static void davinci_c2state_enter(u32 flags)
+{
+	davinci_save_ddr_power(1, !!(flags & DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN));
+}
+
+static void davinci_c2state_exit(u32 flags)
+{
+	davinci_save_ddr_power(0, !!(flags & DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN));
+}
+
+static struct davinci_ops davinci_states[DAVINCI_CPUIDLE_MAX_STATES] = {
+	[1] = {
+		.enter	= davinci_c2state_enter,
+		.exit	= davinci_c2state_exit,
+	},
+};
+
 static int __init davinci_cpuidle_probe(struct platform_device *pdev)
 {
 	int ret;
@@ -94,7 +124,11 @@ static int __init davinci_cpuidle_probe(struct platform_device *pdev)
 
 	ddr2_reg_base = pdata->ddr2_ctlr_base;
 
-	ddr2_pdown = pdata->ddr2_pdown;
+	if (pdata->ddr2_pdown)
+		davinci_states[1].flags |= DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN;
+	cpuidle_set_statedata(&device->states_usage[1], &davinci_states[1]);
+
+	device->state_count = DAVINCI_CPUIDLE_MAX_STATES;
 
 	ret = cpuidle_register_driver(&davinci_idle_driver);
 	if (ret) {
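The DaVinci driver above stops keeping the DDR2 power-down decision in file-scope globals and instead hangs a per-state ops structure off the cpuidle state via cpuidle_set_statedata()/cpuidle_get_statedata(). A self-contained sketch of that opaque per-state data pattern (types and names here are illustrative, not the kernel's):

    #include <stdio.h>

    struct state_usage { void *driver_data; };   /* one void* slot per state */

    struct ddr_ops {
        void (*enter)(unsigned flags);
        void (*exit)(unsigned flags);
        unsigned flags;
    };

    static void set_statedata(struct state_usage *su, void *d) { su->driver_data = d; }
    static void *get_statedata(struct state_usage *su) { return su->driver_data; }

    static void c2_enter(unsigned flags) { printf("enter, flags=%#x\n", flags); }
    static void c2_exit(unsigned flags)  { printf("exit,  flags=%#x\n", flags); }

    int main(void)
    {
        struct state_usage states[2] = { { NULL }, { NULL } };
        struct ddr_ops ops = { c2_enter, c2_exit, 0x1 };

        set_statedata(&states[1], &ops);                /* probe time */

        struct ddr_ops *o = get_statedata(&states[1]);  /* idle entry */
        if (o && o->enter)
            o->enter(o->flags);
        /* ...low-power entry would happen here... */
        if (o && o->exit)
            o->exit(o->flags);
        return 0;
    }
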
diff --git a/trunk/arch/arm64/kernel/process.c b/trunk/arch/arm64/kernel/process.c
index c7002d40a9b0..cb0956bc96ed 100644
--- a/trunk/arch/arm64/kernel/process.c
+++ b/trunk/arch/arm64/kernel/process.c
@@ -97,9 +97,14 @@ static void default_idle(void)
 	local_irq_enable();
 }
 
+void (*pm_idle)(void) = default_idle;
+EXPORT_SYMBOL_GPL(pm_idle);
+
 /*
- * The idle thread.
- * We always respect 'hlt_counter' to prevent low power idle.
+ * The idle thread, has rather strange semantics for calling pm_idle,
+ * but this is what x86 does and we need to do the same, so that
+ * things like cpuidle get called in the same way. The only difference
+ * is that we always respect 'hlt_counter' to prevent low power idle.
 */
 void cpu_idle(void)
 {
@@ -117,10 +122,10 @@ void cpu_idle(void)
 		local_irq_disable();
 		if (!need_resched()) {
 			stop_critical_timings();
-			default_idle();
+			pm_idle();
 			start_critical_timings();
 			/*
-			 * default_idle functions should always return
+			 * pm_idle functions should always return
 			 * with IRQs enabled.
 			 */
 			WARN_ON(irqs_disabled());
diff --git a/trunk/arch/blackfin/kernel/process.c b/trunk/arch/blackfin/kernel/process.c
index 8061426b7df5..3e16ad9b0a99 100644
--- a/trunk/arch/blackfin/kernel/process.c
+++ b/trunk/arch/blackfin/kernel/process.c
@@ -39,6 +39,12 @@ int nr_l1stack_tasks;
 void *l1_stack_base;
 unsigned long l1_stack_len;
 
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void) = NULL;
+EXPORT_SYMBOL(pm_idle);
+
 void (*pm_power_off)(void) = NULL;
 EXPORT_SYMBOL(pm_power_off);
 
@@ -75,6 +81,7 @@ void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
+		void (*idle)(void) = pm_idle;
 
 #ifdef CONFIG_HOTPLUG_CPU
 		if (cpu_is_offline(smp_processor_id()))
diff --git a/trunk/arch/cris/kernel/process.c b/trunk/arch/cris/kernel/process.c
index 104ff4dd9b98..7f65be6f7f17 100644
--- a/trunk/arch/cris/kernel/process.c
+++ b/trunk/arch/cris/kernel/process.c
@@ -54,6 +54,11 @@ void enable_hlt(void)
 
 EXPORT_SYMBOL(enable_hlt);
 
+/*
+ * The following aren't currently used.
+ */
+void (*pm_idle)(void);
+
 extern void default_idle(void);
 
 void (*pm_power_off)(void);
@@ -72,12 +77,16 @@ void cpu_idle (void)
 	while (1) {
 		rcu_idle_enter();
 		while (!need_resched()) {
+			void (*idle)(void);
 			/*
 			 * Mark this as an RCU critical section so that
 			 * synchronize_kernel() in the unload path waits
 			 * for our completion.
 			 */
-			default_idle();
+			idle = pm_idle;
+			if (!idle)
+				idle = default_idle;
+			idle();
 		}
 		rcu_idle_exit();
 		schedule_preempt_disabled();
diff --git a/trunk/arch/ia64/kernel/process.c b/trunk/arch/ia64/kernel/process.c
index e34f565f595a..31360cbbd5f8 100644
--- a/trunk/arch/ia64/kernel/process.c
+++ b/trunk/arch/ia64/kernel/process.c
@@ -57,6 +57,8 @@ void (*ia64_mark_idle)(int);
 
 unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
+void (*pm_idle) (void);
+EXPORT_SYMBOL(pm_idle);
 void (*pm_power_off) (void);
 EXPORT_SYMBOL(pm_power_off);
 
@@ -299,6 +301,7 @@ cpu_idle (void)
 			if (mark_idle)
 				(*mark_idle)(1);
 
+			idle = pm_idle;
 			if (!idle)
 				idle = default_idle;
 			(*idle)();
diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c
index 2029cc0d2fc6..aaefd9b94f2f 100644
--- a/trunk/arch/ia64/kernel/setup.c
+++ b/trunk/arch/ia64/kernel/setup.c
@@ -1051,6 +1051,7 @@ cpu_init (void)
 		max_num_phys_stacked = num_phys_stacked;
 	}
 	platform_cpu_init();
+	pm_idle = default_idle;
 }
 
 void __init
diff --git a/trunk/arch/m32r/kernel/process.c b/trunk/arch/m32r/kernel/process.c
index bde899e155d3..765d0f57c787 100644
--- a/trunk/arch/m32r/kernel/process.c
+++ b/trunk/arch/m32r/kernel/process.c
@@ -44,9 +44,35 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return tsk->thread.lr;
 }
 
+/*
+ * Powermanagement idle function, if any..
+ */
+static void (*pm_idle)(void) = NULL;
+
 void (*pm_power_off)(void) = NULL;
 EXPORT_SYMBOL(pm_power_off);
 
+/*
+ * We use this is we don't have any better
+ * idle routine..
+ */
+static void default_idle(void)
+{
+	/* M32R_FIXME: Please use "cpu_sleep" mode. */
+	cpu_relax();
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle (void)
+{
+	/* M32R_FIXME */
+	cpu_relax();
+}
+
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
@@ -58,8 +84,14 @@ void cpu_idle (void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		rcu_idle_enter();
-		while (!need_resched())
-			cpu_relax();
+		while (!need_resched()) {
+			void (*idle)(void) = pm_idle;
+
+			if (!idle)
+				idle = default_idle;
+
+			idle();
+		}
 		rcu_idle_exit();
 		schedule_preempt_disabled();
 	}
@@ -88,6 +120,21 @@ void machine_power_off(void)
 	/* M32R_FIXME */
 }
 
+static int __init idle_setup (char *str)
+{
+	if (!strncmp(str, "poll", 4)) {
+		printk("using poll in idle threads.\n");
+		pm_idle = poll_idle;
+	} else if (!strncmp(str, "sleep", 4)) {
+		printk("using sleep in idle threads.\n");
+		pm_idle = default_idle;
+	}
+
+	return 1;
+}
+
+__setup("idle=", idle_setup);
+
 void show_regs(struct pt_regs * regs)
 {
 	printk("\n");
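The m32r hunks above wire pm_idle selection to an "idle=" boot parameter via __setup(). The parsing idiom is a simple prefix match that retargets the hook; sketched stand-alone below (names are illustrative, not kernel code):

    #include <stdio.h>
    #include <string.h>

    static void default_idle(void) { }
    static void poll_idle(void)    { }

    static void (*pm_idle_hook)(void);

    static int idle_setup(const char *str)
    {
        if (!strncmp(str, "poll", 4)) {
            printf("using poll in idle threads\n");
            pm_idle_hook = poll_idle;
        } else if (!strncmp(str, "sleep", 5)) {
            printf("using sleep in idle threads\n");
            pm_idle_hook = default_idle;
        }
        return 1;
    }

    int main(int argc, char **argv)
    {
        idle_setup(argc > 1 ? argv[1] : "sleep");
        return pm_idle_hook == NULL;    /* 0 if an option matched */
    }
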
diff --git a/trunk/arch/microblaze/kernel/process.c b/trunk/arch/microblaze/kernel/process.c
index 6ff2dcff3410..a5b74f729e5b 100644
--- a/trunk/arch/microblaze/kernel/process.c
+++ b/trunk/arch/microblaze/kernel/process.c
@@ -41,6 +41,7 @@ void show_regs(struct pt_regs *regs)
 		 regs->msr, regs->ear, regs->esr, regs->fsr);
 }
 
+void (*pm_idle)(void);
 void (*pm_power_off)(void) = NULL;
 EXPORT_SYMBOL(pm_power_off);
 
@@ -97,6 +98,8 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
+		void (*idle)(void) = pm_idle;
+
 		if (!idle)
 			idle = default_idle;
diff --git a/trunk/arch/mn10300/kernel/process.c b/trunk/arch/mn10300/kernel/process.c
index 84f4e97e3074..eb09f5a552ff 100644
--- a/trunk/arch/mn10300/kernel/process.c
+++ b/trunk/arch/mn10300/kernel/process.c
@@ -36,6 +36,12 @@
 #include 
 #include "internal.h"
 
+/*
+ * power management idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
 /*
  * return saved PC of a blocked thread.
 */
@@ -107,6 +113,7 @@ void cpu_idle(void)
 			void (*idle)(void);
 
 			smp_rmb();
+			idle = pm_idle;
 			if (!idle) {
 #if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
 				idle = poll_idle;
diff --git a/trunk/arch/openrisc/kernel/idle.c b/trunk/arch/openrisc/kernel/idle.c
index 5e8a3b6d6bc6..7d618feb1b72 100644
--- a/trunk/arch/openrisc/kernel/idle.c
+++ b/trunk/arch/openrisc/kernel/idle.c
@@ -39,6 +39,11 @@
 
 void (*powersave) (void) = NULL;
 
+static inline void pm_idle(void)
+{
+	barrier();
+}
+
 void cpu_idle(void)
 {
 	set_thread_flag(TIF_POLLING_NRFLAG);
diff --git a/trunk/arch/sh/kernel/idle.c b/trunk/arch/sh/kernel/idle.c
index 3d5a1b387cc0..0c910163caa3 100644
--- a/trunk/arch/sh/kernel/idle.c
+++ b/trunk/arch/sh/kernel/idle.c
@@ -22,7 +22,7 @@
 #include 
 #include 
 
-static void (*sh_idle)(void);
+void (*pm_idle)(void);
 
 static int hlt_counter;
 
@@ -103,9 +103,9 @@ void cpu_idle(void)
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
 			if (cpuidle_idle_call())
-				sh_idle();
+				pm_idle();
 			/*
-			 * Sanity check to ensure that sh_idle() returns
+			 * Sanity check to ensure that pm_idle() returns
 			 * with IRQs enabled
 			 */
 			WARN_ON(irqs_disabled());
@@ -123,13 +123,13 @@ void __init select_idle_routine(void)
 	/*
 	 * If a platform has set its own idle routine, leave it alone.
 	 */
-	if (sh_idle)
+	if (pm_idle)
 		return;
 
 	if (hlt_works())
-		sh_idle = default_idle;
+		pm_idle = default_idle;
 	else
-		sh_idle = poll_idle;
+		pm_idle = poll_idle;
 }
 
 void stop_this_cpu(void *unused)
diff --git a/trunk/arch/sparc/include/asm/processor_32.h b/trunk/arch/sparc/include/asm/processor_32.h
index 2c7baa4c4505..c1e01914fd98 100644
--- a/trunk/arch/sparc/include/asm/processor_32.h
+++ b/trunk/arch/sparc/include/asm/processor_32.h
@@ -118,7 +118,6 @@ extern unsigned long get_wchan(struct task_struct *);
 extern struct task_struct *last_task_used_math;
 
 #define cpu_relax()	barrier()
-extern void (*sparc_idle)(void);
 
 #endif
diff --git a/trunk/arch/sparc/kernel/apc.c b/trunk/arch/sparc/kernel/apc.c
index eefda32b595e..348fa1aeabce 100644
--- a/trunk/arch/sparc/kernel/apc.c
+++ b/trunk/arch/sparc/kernel/apc.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /* Debugging
 *
@@ -159,7 +158,7 @@ static int apc_probe(struct platform_device *op)
 
 	/* Assign power management IDLE handler */
 	if (!apc_no_idle)
-		sparc_idle = apc_swift_idle;
+		pm_idle = apc_swift_idle;
 
 	printk(KERN_INFO "%s: power management initialized%s\n",
 	       APC_DEVNAME, apc_no_idle ? " (CPU idle disabled)" : "");
diff --git a/trunk/arch/sparc/kernel/leon_pmc.c b/trunk/arch/sparc/kernel/leon_pmc.c
index 708bca435219..4e174321097d 100644
--- a/trunk/arch/sparc/kernel/leon_pmc.c
+++ b/trunk/arch/sparc/kernel/leon_pmc.c
@@ -9,7 +9,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /* List of Systems that need fixup instructions around power-down instruction */
 unsigned int pmc_leon_fixup_ids[] = {
@@ -70,9 +69,9 @@ static int __init leon_pmc_install(void)
 	if (sparc_cpu_model == sparc_leon) {
 		/* Assign power management IDLE handler */
 		if (pmc_leon_need_fixup())
-			sparc_idle = pmc_leon_idle_fixup;
+			pm_idle = pmc_leon_idle_fixup;
 		else
-			sparc_idle = pmc_leon_idle;
+			pm_idle = pmc_leon_idle;
 
 		printk(KERN_INFO "leon: power management initialized\n");
 	}
diff --git a/trunk/arch/sparc/kernel/pmc.c b/trunk/arch/sparc/kernel/pmc.c
index 8b7297faca79..dcbb62f63068 100644
--- a/trunk/arch/sparc/kernel/pmc.c
+++ b/trunk/arch/sparc/kernel/pmc.c
@@ -17,7 +17,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /* Debug
 *
@@ -64,7 +63,7 @@ static int pmc_probe(struct platform_device *op)
 
 #ifndef PMC_NO_IDLE
 	/* Assign power management IDLE handler */
-	sparc_idle = pmc_swift_idle;
+	pm_idle = pmc_swift_idle;
 #endif
 
 	printk(KERN_INFO "%s: power management initialized\n", PMC_DEVNAME);
diff --git a/trunk/arch/sparc/kernel/process_32.c b/trunk/arch/sparc/kernel/process_32.c
index 62eede13831a..be8e862badaf 100644
--- a/trunk/arch/sparc/kernel/process_32.c
+++ b/trunk/arch/sparc/kernel/process_32.c
@@ -43,7 +43,8 @@
 * Power management idle function
 * Set in pm platform drivers (apc.c and pmc.c)
 */
-void (*sparc_idle)(void);
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
 
 /*
 * Power-off handler instantiation for pm.h compliance
@@ -74,8 +75,8 @@ void cpu_idle(void)
 	/* endless idle loop with no priority at all */
 	for (;;) {
 		while (!need_resched()) {
-			if (sparc_idle)
-				(*sparc_idle)();
+			if (pm_idle)
+				(*pm_idle)();
 			else
 				cpu_relax();
 		}
diff --git a/trunk/arch/unicore32/kernel/process.c b/trunk/arch/unicore32/kernel/process.c
index 872d7e22d847..62bad9fed03e 100644
--- a/trunk/arch/unicore32/kernel/process.c
+++ b/trunk/arch/unicore32/kernel/process.c
@@ -45,6 +45,11 @@ static const char * const processor_modes[] = {
 	"UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR"
 };
 
+/*
+ * The idle thread, has rather strange semantics for calling pm_idle,
+ * but this is what x86 does and we need to do the same, so that
+ * things like cpuidle get called in the same way.
+ */
 void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig
index 1b635861401c..225543bf45a5 100644
--- a/trunk/arch/x86/Kconfig
+++ b/trunk/arch/x86/Kconfig
@@ -1912,7 +1912,6 @@ config APM_DO_ENABLE
 	  this feature.
 
 config APM_CPU_IDLE
-	depends on CPU_IDLE
 	bool "Make CPU Idle calls when idle"
 	---help---
 	  Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
diff --git a/trunk/arch/x86/include/asm/mwait.h b/trunk/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..bcdff997668c 100644
--- a/trunk/arch/x86/include/asm/mwait.h
+++ b/trunk/arch/x86/include/asm/mwait.h
@@ -4,8 +4,7 @@
 #define MWAIT_SUBSTATE_MASK		0xf
 #define MWAIT_CSTATE_MASK		0xf
 #define MWAIT_SUBSTATE_SIZE		4
-#define MWAIT_HINT2CSTATE(hint)		(((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
-#define MWAIT_HINT2SUBSTATE(hint)	((hint) & MWAIT_CSTATE_MASK)
+#define MWAIT_MAX_NUM_CSTATES		8
 
 #define CPUID_MWAIT_LEAF		5
 #define CPUID5_ECX_EXTENSIONS_SUPPORTED	0x1
diff --git a/trunk/arch/x86/include/asm/processor.h b/trunk/arch/x86/include/asm/processor.h
index 888184b2fc85..c2f7f472275e 100644
--- a/trunk/arch/x86/include/asm/processor.h
+++ b/trunk/arch/x86/include/asm/processor.h
@@ -998,7 +998,11 @@ extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
 void default_idle(void);
-bool set_pm_idle_to_default(void);
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void);
+#else
+#define xen_set_default_idle 0
+#endif
 
 void stop_this_cpu(void *dummy);
diff --git a/trunk/arch/x86/include/uapi/asm/msr-index.h b/trunk/arch/x86/include/uapi/asm/msr-index.h
index 8d013f5153bc..433a59fb1a74 100644
--- a/trunk/arch/x86/include/uapi/asm/msr-index.h
+++ b/trunk/arch/x86/include/uapi/asm/msr-index.h
@@ -103,8 +103,6 @@
 #define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
 #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
 
-#define MSR_IA32_POWER_CTL		0x000001fc
-
 #define MSR_IA32_MC0_CTL		0x00000400
 #define MSR_IA32_MC0_STATUS		0x00000401
 #define MSR_IA32_MC0_ADDR		0x00000402
@@ -274,7 +272,6 @@
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_EBC_FREQUENCY_ID		0x0000002c
-#define MSR_SMI_COUNT			0x00000034
 #define MSR_IA32_FEATURE_CONTROL	0x0000003a
 #define MSR_IA32_TSC_ADJUST		0x0000003b
diff --git a/trunk/arch/x86/kernel/apm_32.c b/trunk/arch/x86/kernel/apm_32.c
index 9f4bc6a1164d..d65464e43503 100644
--- a/trunk/arch/x86/kernel/apm_32.c
+++ b/trunk/arch/x86/kernel/apm_32.c
@@ -232,7 +232,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -361,35 +360,13 @@ struct apm_user {
 * idle percentage above which bios idle calls are done
 */
 #ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
 #define DEFAULT_IDLE_THRESHOLD	95
 #else
 #define DEFAULT_IDLE_THRESHOLD	100
 #endif
 #define DEFAULT_IDLE_PERIOD	(100 / 3)
 
-static int apm_cpu_idle(struct cpuidle_device *dev,
-			struct cpuidle_driver *drv, int index);
-
-static struct cpuidle_driver apm_idle_driver = {
-	.name = "apm_idle",
-	.owner = THIS_MODULE,
-	.en_core_tk_irqen = 1,
-	.states = {
-		{ /* entry 0 is for polling */ },
-		{ /* entry 1 is for APM idle */
-			.name = "APM",
-			.desc = "APM idle",
-			.flags = CPUIDLE_FLAG_TIME_VALID,
-			.exit_latency = 250,	/* WAG */
-			.target_residency = 500,	/* WAG */
-			.enter = &apm_cpu_idle
-		},
-	},
-	.state_count = 2,
-};
-
-static struct cpuidle_device apm_cpuidle_device;
-
 /*
 * Local variables
 */
@@ -400,6 +377,7 @@ static struct {
 static int clock_slowed;
 static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
 static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int set_pm_idle;
 static int suspends_pending;
 static int standbys_pending;
 static int ignore_sys_suspend;
@@ -906,6 +884,8 @@ static void apm_do_busy(void)
 #define IDLE_CALC_LIMIT	(HZ * 100)
 #define IDLE_LEAKY_MAX	16
 
+static void (*original_pm_idle)(void) __read_mostly;
+
 /**
 * apm_cpu_idle		-	cpu idling for APM capable Linux
 *
@@ -914,8 +894,7 @@ static void apm_do_busy(void)
 * Furthermore it calls the system default idle routine.
 */
 
-static int apm_cpu_idle(struct cpuidle_device *dev,
-			struct cpuidle_driver *drv, int index)
+static void apm_cpu_idle(void)
 {
 	static int use_apm_idle; /* = 0 */
 	static unsigned int last_jiffies; /* = 0 */
@@ -925,6 +904,7 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
+	WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
 recalc:
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
@@ -970,7 +950,10 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 			break;
 		}
 	}
-	default_idle();
+	if (original_pm_idle)
+		original_pm_idle();
+	else
+		default_idle();
 	local_irq_disable();
 	jiffies_since_last_check = jiffies - last_jiffies;
 	if (jiffies_since_last_check > idle_period)
@@ -980,7 +963,7 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 	if (apm_idle_done)
 		apm_do_busy();
 
-	return index;
+	local_irq_enable();
 }
 
 /**
@@ -2398,9 +2381,9 @@ static int __init apm_init(void)
 	if (HZ != 100)
 		idle_period = (idle_period * HZ) / 100;
 	if (idle_threshold < 100) {
-		if (!cpuidle_register_driver(&apm_idle_driver))
-			if (cpuidle_register_device(&apm_cpuidle_device))
-				cpuidle_unregister_driver(&apm_idle_driver);
+		original_pm_idle = pm_idle;
+		pm_idle = apm_cpu_idle;
+		set_pm_idle = 1;
 	}
 
 	return 0;
@@ -2410,9 +2393,15 @@ static void __exit apm_exit(void)
 {
 	int error;
 
-	cpuidle_unregister_device(&apm_cpuidle_device);
-	cpuidle_unregister_driver(&apm_idle_driver);
-
+	if (set_pm_idle) {
+		pm_idle = original_pm_idle;
+		/*
+		 * We are about to unload the current idle thread pm callback
+		 * (pm_idle), Wait for all processors to update cached/local
+		 * copies of pm_idle before proceeding.
+		 */
+		kick_all_cpus_sync();
+	}
 	if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
 	    && (apm_info.connection_version > 0x0100)) {
 		error = apm_engage_power_management(APM_DEVICE_ALL, 0);
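apm_init()/apm_exit() above chain the idle hook rather than replace it outright: the old pm_idle is saved, the APM wrapper is installed, and the saved pointer is restored (followed by kick_all_cpus_sync()) on module unload. A minimal user-space model of that save/wrap/restore discipline (illustrative names):

    #include <stdio.h>

    static void default_idle(void) { puts("default_idle"); }

    static void (*pm_idle_hook)(void) = default_idle;
    static void (*original_hook)(void);

    static void apm_like_idle(void)
    {
        puts("apm wrapper");
        if (original_hook)
            original_hook();   /* fall through to whoever was installed before */
        else
            default_idle();
    }

    static void install(void)
    {
        original_hook = pm_idle_hook;
        pm_idle_hook = apm_like_idle;
    }

    static void uninstall(void)
    {
        /* in the kernel, kick_all_cpus_sync() follows this store so no
         * CPU keeps a stale cached copy of the outgoing wrapper */
        pm_idle_hook = original_hook;
    }

    int main(void)
    {
        install();
        pm_idle_hook();   /* wrapper, then the saved routine */
        uninstall();
        pm_idle_hook();   /* back to default_idle */
        return 0;
    }
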
diff --git a/trunk/arch/x86/kernel/process.c b/trunk/arch/x86/kernel/process.c
index ceb05db59be1..7ed9f6b08ba0 100644
--- a/trunk/arch/x86/kernel/process.c
+++ b/trunk/arch/x86/kernel/process.c
@@ -268,7 +268,13 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
 
-static void (*x86_idle)(void);
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(pm_idle);
+#endif
 
 #ifndef CONFIG_SMP
 static inline void play_dead(void)
@@ -345,7 +351,7 @@ void cpu_idle(void)
 			rcu_idle_enter();
 
 			if (cpuidle_idle_call())
-				x86_idle();
+				pm_idle();
 
 			rcu_idle_exit();
 			start_critical_timings();
@@ -390,14 +396,16 @@ void default_idle(void)
 EXPORT_SYMBOL(default_idle);
 #endif
 
-bool set_pm_idle_to_default(void)
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
 {
-	bool ret = !!x86_idle;
+	bool ret = !!pm_idle;
 
-	x86_idle = default_idle;
+	pm_idle = default_idle;
 
 	return ret;
 }
+#endif
 void stop_this_cpu(void *dummy)
 {
 	local_irq_disable();
@@ -561,10 +569,11 @@ static void amd_e400_idle(void)
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	if (x86_idle == poll_idle && smp_num_siblings > 1)
+	if (pm_idle == poll_idle && smp_num_siblings > 1) {
 		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+	}
 #endif
-	if (x86_idle)
+	if (pm_idle)
 		return;
 
 	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
@@ -572,19 +581,19 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		 * One CPU supports mwait => All CPUs supports mwait
 		 */
 		pr_info("using mwait in idle threads\n");
-		x86_idle = mwait_idle;
+		pm_idle = mwait_idle;
 	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
 		pr_info("using AMD E400 aware idle routine\n");
-		x86_idle = amd_e400_idle;
+		pm_idle = amd_e400_idle;
 	} else
-		x86_idle = default_idle;
+		pm_idle = default_idle;
 }
 
 void __init init_amd_e400_c1e_mask(void)
 {
 	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
-	if (x86_idle == amd_e400_idle)
+	if (pm_idle == amd_e400_idle)
 		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
 }
 
@@ -595,7 +604,7 @@ static int __init idle_setup(char *str)
 
 	if (!strcmp(str, "poll")) {
 		pr_info("using polling idle threads\n");
-		x86_idle = poll_idle;
+		pm_idle = poll_idle;
 		boot_option_idle_override = IDLE_POLL;
 	} else if (!strcmp(str, "mwait")) {
 		boot_option_idle_override = IDLE_FORCE_MWAIT;
@@ -608,7 +617,7 @@ static int __init idle_setup(char *str)
 		 * To continue to load the CPU idle driver, don't touch
 		 * the boot_option_idle_override.
 		 */
-		x86_idle = default_idle;
+		pm_idle = default_idle;
 		boot_option_idle_override = IDLE_HALT;
 	} else if (!strcmp(str, "nomwait")) {
 		/*
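select_idle_routine() above is a first-match cascade: an already-set hook (e.g. from "idle=poll") wins, then MWAIT, then the AMD E400 workaround, then the default. The shape of that selection, sketched with made-up feature flags:

    #include <stdio.h>

    struct cpuinfo { int has_mwait; int has_e400_erratum; };   /* illustrative */

    static void mwait_idle(void)    { }
    static void amd_e400_idle(void) { }
    static void default_idle(void)  { }

    static void (*pm_idle_hook)(void);

    static void select_idle_routine(const struct cpuinfo *c)
    {
        if (pm_idle_hook)          /* a boot option already chose a routine */
            return;

        if (c->has_mwait)
            pm_idle_hook = mwait_idle;
        else if (c->has_e400_erratum)
            pm_idle_hook = amd_e400_idle;
        else
            pm_idle_hook = default_idle;
    }

    int main(void)
    {
        struct cpuinfo c = { .has_mwait = 1, .has_e400_erratum = 0 };

        select_idle_routine(&c);
        printf("picked %s\n", pm_idle_hook == mwait_idle ? "mwait_idle" : "other");
        return 0;
    }
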
diff --git a/trunk/arch/x86/xen/setup.c b/trunk/arch/x86/xen/setup.c
index 8971a26d21ab..2b73b5c8555f 100644
--- a/trunk/arch/x86/xen/setup.c
+++ b/trunk/arch/x86/xen/setup.c
@@ -561,7 +561,7 @@ void __init xen_arch_setup(void)
 #endif
 	disable_cpuidle();
 	disable_cpufreq();
-	WARN_ON(set_pm_idle_to_default());
+	WARN_ON(xen_set_default_idle());
 	fiddle_vdso();
 #ifdef CONFIG_NUMA
 	numa_off = 1;
diff --git a/trunk/drivers/acpi/processor_idle.c b/trunk/drivers/acpi/processor_idle.c
index 8b433cb08a33..ed9a1cc690be 100644
--- a/trunk/drivers/acpi/processor_idle.c
+++ b/trunk/drivers/acpi/processor_idle.c
@@ -28,12 +28,19 @@
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
+#include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
-#include 	/* need_resched() */
+#include 
+#include 	/* need_resched() */
+#include 
 #include 
 #include 
+#include 
 
 /*
 * Include the apic definitions for x86 to have the APIC timer related defines
@@ -45,14 +52,22 @@
 #include 
 #endif
 
+#include 
+#include 
+
 #include 
 #include 
+#include 
 
 #define PREFIX "ACPI: "
 
 #define ACPI_PROCESSOR_CLASS	"processor"
 #define _COMPONENT		ACPI_PROCESSOR_COMPONENT
 ACPI_MODULE_NAME("processor_idle");
+#define PM_TIMER_TICK_NS	(1000000000ULL/PM_TIMER_FREQUENCY)
+#define C2_OVERHEAD		1	/* 1us */
+#define C3_OVERHEAD		1	/* 1us */
+#define PM_TIMER_TICKS_TO_US(p)	(((p) * 1000)/(PM_TIMER_FREQUENCY/1000))
 
 static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER;
 module_param(max_cstate, uint, 0000);
@@ -66,8 +81,6 @@ module_param(latency_factor, uint, 0644);
 
 static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
 
-static struct acpi_processor_cx *acpi_cstate[CPUIDLE_STATE_MAX];
-
 static int disabled_by_idle_boot_param(void)
 {
 	return boot_option_idle_override == IDLE_POLL ||
@@ -723,7 +736,8 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 		struct cpuidle_driver *drv, int index)
 {
 	struct acpi_processor *pr;
-	struct acpi_processor_cx *cx = acpi_cstate[index];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 
 	pr = __this_cpu_read(processors);
 
@@ -746,7 +760,8 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 */
 static int acpi_idle_play_dead(struct cpuidle_device *dev, int index)
 {
-	struct acpi_processor_cx *cx = acpi_cstate[index];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 
 	ACPI_FLUSH_CPU_CACHE();
 
@@ -776,7 +791,8 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 		struct cpuidle_driver *drv, int index)
 {
 	struct acpi_processor *pr;
-	struct acpi_processor_cx *cx = acpi_cstate[index];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 
 	pr = __this_cpu_read(processors);
 
@@ -834,7 +850,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 		struct cpuidle_driver *drv, int index)
 {
 	struct acpi_processor *pr;
-	struct acpi_processor_cx *cx = acpi_cstate[index];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 
 	pr = __this_cpu_read(processors);
 
@@ -926,13 +943,13 @@ struct cpuidle_driver acpi_idle_driver = {
 * device i.e.
 * device i.e. per-cpu data
 *
 * @pr: the ACPI processor
-* @dev : the cpuidle device
 */
-static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
-					   struct cpuidle_device *dev)
+static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr)
 {
 	int i, count = CPUIDLE_DRIVER_STATE_START;
 	struct acpi_processor_cx *cx;
+	struct cpuidle_state_usage *state_usage;
+	struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id);
 
 	if (!pr->flags.power_setup_done)
 		return -EINVAL;
@@ -951,6 +968,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
 
 	for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
 		cx = &pr->power.states[i];
+		state_usage = &dev->states_usage[count];
 
 		if (!cx->valid)
 			continue;
@@ -961,7 +979,8 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
 		    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
 			continue;
 #endif
-		acpi_cstate[count] = cx;
+
+		cpuidle_set_statedata(state_usage, cx);
 
 		count++;
 		if (count == CPUIDLE_STATE_MAX)
@@ -1085,7 +1104,7 @@ int acpi_processor_hotplug(struct acpi_processor *pr)
 	cpuidle_disable_device(dev);
 	acpi_processor_get_power_info(pr);
 	if (pr->flags.power) {
-		acpi_processor_setup_cpuidle_cx(pr, dev);
+		acpi_processor_setup_cpuidle_cx(pr);
 		ret = cpuidle_enable_device(dev);
 	}
 	cpuidle_resume_and_unlock();
@@ -1143,8 +1162,8 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr)
 			continue;
 		acpi_processor_get_power_info(_pr);
 		if (_pr->flags.power) {
+			acpi_processor_setup_cpuidle_cx(_pr);
 			dev = per_cpu(acpi_cpuidle_device, cpu);
-			acpi_processor_setup_cpuidle_cx(_pr, dev);
 			cpuidle_enable_device(dev);
 		}
 	}
@@ -1213,7 +1232,7 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr)
 		return -ENOMEM;
 	per_cpu(acpi_cpuidle_device, pr->id) = dev;
 
-	acpi_processor_setup_cpuidle_cx(pr, dev);
+	acpi_processor_setup_cpuidle_cx(pr);
 
 	/* Register per-cpu cpuidle_device. Cpuidle driver
 	 * must already be registered before registering device
diff --git a/trunk/drivers/idle/intel_idle.c b/trunk/drivers/idle/intel_idle.c
index 5d6675013864..2df9414a72f7 100644
--- a/trunk/drivers/idle/intel_idle.c
+++ b/trunk/drivers/idle/intel_idle.c
@@ -74,7 +74,7 @@ static struct cpuidle_driver intel_idle_driver = {
 	.en_core_tk_irqen = 1,
 };
 /* intel_idle.max_cstate=0 disables driver */
-static int max_cstate = CPUIDLE_STATE_MAX - 1;
+static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;
 
 static unsigned int mwait_substates;
 
@@ -90,7 +90,6 @@ struct idle_cpu {
 	 * Indicate which enable bits to clear here.
 	 */
 	unsigned long auto_demotion_disable_flags;
-	bool disable_promotion_to_c1e;
 };
 
 static const struct idle_cpu *icpu;
@@ -109,207 +108,163 @@ static struct cpuidle_state *cpuidle_state_table;
 */
 #define CPUIDLE_FLAG_TLB_FLUSHED	0x10000
 
-/*
- * MWAIT takes an 8-bit "hint" in EAX "suggesting"
- * the C-state (top nibble) and sub-state (bottom nibble)
- * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
- *
- * We store the hint at the top of our "flags" for each state.
- */
-#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
-#define MWAIT2flg(eax) ((eax & 0xFF) << 24)
-
 /*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 */
-static struct cpuidle_state nehalem_cstates[CPUIDLE_STATE_MAX] = {
-	{
+static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
+	{ /* MWAIT C0 */ },
+	{ /* MWAIT C1 */
 		.name = "C1-NHM",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
+		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 3,
 		.target_residency = 6,
 		.enter = &intel_idle },
-	{
-		.name = "C1E-NHM",
-		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 20,
-		.enter = &intel_idle },
-	{
+	{ /* MWAIT C2 */
 		.name = "C3-NHM",
 		.desc = "MWAIT 0x10",
-		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 20,
 		.target_residency = 80,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C3 */
 		.name = "C6-NHM",
 		.desc = "MWAIT 0x20",
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 200,
 		.target_residency = 800,
 		.enter = &intel_idle },
-	{
-		.enter = NULL }
 };
 
-static struct cpuidle_state snb_cstates[CPUIDLE_STATE_MAX] = {
-	{
+static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+	{ /* MWAIT C0 */ },
+	{ /* MWAIT C1 */
 		.name = "C1-SNB",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 2,
-		.target_residency = 2,
-		.enter = &intel_idle },
-	{
-		.name = "C1E-SNB",
-		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 20,
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 1,
+		.target_residency = 1,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C2 */
 		.name = "C3-SNB",
 		.desc = "MWAIT 0x10",
-		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 80,
 		.target_residency = 211,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C3 */
 		.name = "C6-SNB",
 		.desc = "MWAIT 0x20",
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 104,
 		.target_residency = 345,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C4 */
 		.name = "C7-SNB",
 		.desc = "MWAIT 0x30",
-		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 109,
 		.target_residency = 345,
 		.enter = &intel_idle },
-	{
-		.enter = NULL }
 };
 
-static struct cpuidle_state ivb_cstates[CPUIDLE_STATE_MAX] = {
-	{
+static struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+	{ /* MWAIT C0 */ },
+	{ /* MWAIT C1 */
 		.name = "C1-IVB",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
+		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 1,
 		.target_residency = 1,
 		.enter = &intel_idle },
-	{
-		.name = "C1E-IVB",
-		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 20,
-		.enter = &intel_idle },
-	{
+	{ /* MWAIT C2 */
 		.name = "C3-IVB",
 		.desc = "MWAIT 0x10",
-		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 59,
 		.target_residency = 156,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C3 */
 		.name = "C6-IVB",
 		.desc = "MWAIT 0x20",
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 80,
 		.target_residency = 300,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C4 */
 		.name = "C7-IVB",
 		.desc = "MWAIT 0x30",
-		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 87,
 		.target_residency = 300,
 		.enter = &intel_idle },
-	{
-		.enter = NULL }
-};
-
-static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = {
-	{
-		.name = "C1-HSW",
-		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 2,
-		.target_residency = 2,
-		.enter = &intel_idle },
-	{
-		.name = "C1E-HSW",
-		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 20,
-		.enter = &intel_idle },
-	{
-		.name = "C3-HSW",
-		.desc = "MWAIT 0x10",
-		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
-		.exit_latency = 33,
-		.target_residency = 100,
-		.enter = &intel_idle },
-	{
-		.name = "C6-HSW",
-		.desc = "MWAIT 0x20",
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
-		.exit_latency = 133,
-		.target_residency = 400,
-		.enter = &intel_idle },
-	{
-		.name = "C7s-HSW",
-		.desc = "MWAIT 0x32",
-		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
-		.exit_latency = 166,
-		.target_residency = 500,
-		.enter = &intel_idle },
-	{
-		.enter = NULL }
 };
 
-static struct cpuidle_state atom_cstates[CPUIDLE_STATE_MAX] = {
-	{
-		.name = "C1E-ATM",
+static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
+	{ /* MWAIT C0 */ },
+	{ /* MWAIT C1 */
+		.name = "C1-ATM",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 20,
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 1,
+		.target_residency = 4,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C2 */
 		.name = "C2-ATM",
 		.desc = "MWAIT 0x10",
-		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID,
+		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 20,
 		.target_residency = 80,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C3 */ },
+	{ /* MWAIT C4 */
 		.name = "C4-ATM",
 		.desc = "MWAIT 0x30",
-		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 100,
 		.target_residency = 400,
 		.enter = &intel_idle },
-	{
+	{ /* MWAIT C5 */ },
+	{ /* MWAIT C6 */
 		.name = "C6-ATM",
 		.desc = "MWAIT 0x52",
-		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 140,
 		.target_residency = 560,
 		.enter = &intel_idle },
-	{
-		.enter = NULL }
 };
 
+static long get_driver_data(int cstate)
+{
+	int driver_data;
+	switch (cstate) {
+
+	case 1:	/* MWAIT C1 */
+		driver_data = 0x00;
+		break;
+	case 2:	/* MWAIT C2 */
+		driver_data = 0x10;
+		break;
+	case 3:	/* MWAIT C3 */
+		driver_data = 0x20;
+		break;
+	case 4:	/* MWAIT C4 */
+		driver_data = 0x30;
+		break;
+	case 5:	/* MWAIT C5 */
+		driver_data = 0x40;
+		break;
+	case 6:	/* MWAIT C6 */
+		driver_data = 0x52;
+		break;
+	default:
+		driver_data = 0x00;
+	}
+	return driver_data;
+}
+
 /**
 * intel_idle
 * @dev: cpuidle_device
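get_driver_data() above hard-codes the table-index-to-MWAIT-hint mapping that the removed flg2MWAIT()/MWAIT_HINT2CSTATE() macros computed: the hint's top nibble is the zero-based C-state (0x00 = C1, 0x10 = C2, ...) and the bottom nibble the sub-state, so index 6 maps to hint 0x52. A small decoder demonstrating the encoding (a sketch, not the driver's code):

    #include <stdio.h>

    #define MWAIT_SUBSTATE_SIZE 4
    #define MWAIT_CSTATE_MASK   0xf

    /* table index = encoded C-state + 1 (index 1 is "MWAIT C1") */
    static int hint_to_index(int hint)
    {
        return ((hint >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
    }

    static int hint_substate(int hint)
    {
        return hint & MWAIT_CSTATE_MASK;
    }

    int main(void)
    {
        const int hints[] = { 0x00, 0x10, 0x30, 0x52 };

        for (unsigned i = 0; i < sizeof(hints) / sizeof(hints[0]); i++)
            printf("hint %#04x -> table index %d, sub-state %d\n",
                   hints[i], hint_to_index(hints[i]), hint_substate(hints[i]));
        return 0;
    }
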
@@ -323,7 +278,8 @@ static int intel_idle(struct cpuidle_device *dev,
 {
 	unsigned long ecx = 1; /* break on interrupt flag */
 	struct cpuidle_state *state = &drv->states[index];
-	unsigned long eax = flg2MWAIT(state->flags);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
 	unsigned int cstate;
 	int cpu = smp_processor_id();
 
@@ -406,19 +362,10 @@ static void auto_demotion_disable(void *dummy)
 	msr_bits &= ~(icpu->auto_demotion_disable_flags);
 	wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
 }
-static void c1e_promotion_disable(void *dummy)
-{
-	unsigned long long msr_bits;
-
-	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
-	msr_bits &= ~0x2;
-	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
-}
 
 static const struct idle_cpu idle_cpu_nehalem = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
-	.disable_promotion_to_c1e = true,
 };
 
 static const struct idle_cpu idle_cpu_atom = {
@@ -432,17 +379,10 @@ static const struct idle_cpu idle_cpu_lincroft = {
 
 static const struct idle_cpu idle_cpu_snb = {
 	.state_table = snb_cstates,
-	.disable_promotion_to_c1e = true,
 };
 
 static const struct idle_cpu idle_cpu_ivb = {
 	.state_table = ivb_cstates,
-	.disable_promotion_to_c1e = true,
-};
-
-static const struct idle_cpu idle_cpu_hsw = {
-	.state_table = hsw_cstates,
-	.disable_promotion_to_c1e = true,
 };
 
 #define ICPU(model, cpu) \
@@ -462,9 +402,6 @@ static const struct x86_cpu_id intel_idle_ids[] = {
 	ICPU(0x2d, idle_cpu_snb),
 	ICPU(0x3a, idle_cpu_ivb),
 	ICPU(0x3e, idle_cpu_ivb),
-	ICPU(0x3c, idle_cpu_hsw),
-	ICPU(0x3f, idle_cpu_hsw),
-	ICPU(0x45, idle_cpu_hsw),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids);
@@ -547,31 +484,32 @@ static int intel_idle_cpuidle_driver_init(void)
 
 	drv->state_count = 1;
 
-	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
-		int num_substates, mwait_hint, mwait_cstate, mwait_substate;
+	for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
+		int num_substates;
 
-		if (cpuidle_state_table[cstate].enter == NULL)
-			break;
-
-		if (cstate + 1 > max_cstate) {
+		if (cstate > max_cstate) {
 			printk(PREFIX "max_cstate %d reached\n",
 				max_cstate);
 			break;
 		}
 
-		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint);
-		mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint);
-
 		/* does the state exist in CPUID.MWAIT? */
-		num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4))
+		num_substates = (mwait_substates >> ((cstate) * 4))
 					& MWAIT_SUBSTATE_MASK;
-
-		/* if sub-state in table is not enumerated by CPUID */
-		if ((mwait_substate + 1) > num_substates)
+		if (num_substates == 0)
+			continue;
+		/* is the state not enabled? */
+		if (cpuidle_state_table[cstate].enter == NULL) {
+			/* does the driver not know about the state?
+			 */
+			if (*cpuidle_state_table[cstate].name == '\0')
+				pr_debug(PREFIX "unaware of model 0x%x"
+					" MWAIT %d please"
+					" contact lenb@kernel.org\n",
+				boot_cpu_data.x86_model, cstate);
 			continue;
+		}
 
-		if (((mwait_cstate + 1) > 2) &&
+		if ((cstate > 2) &&
 			!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 			mark_tsc_unstable("TSC halts in idle"
 					" states deeper than C2");
@@ -585,9 +523,6 @@ static int intel_idle_cpuidle_driver_init(void)
 	if (icpu->auto_demotion_disable_flags)
 		on_each_cpu(auto_demotion_disable, NULL, 1);
 
-	if (icpu->disable_promotion_to_c1e)	/* each-cpu is redundant */
-		on_each_cpu(c1e_promotion_disable, NULL, 1);
-
 	return 0;
 }
@@ -606,28 +541,25 @@ static int intel_idle_cpu_init(int cpu)
 
 	dev->state_count = 1;
 
-	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
-		int num_substates, mwait_hint, mwait_cstate, mwait_substate;
-
-		if (cpuidle_state_table[cstate].enter == NULL)
-			continue;
+	for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
+		int num_substates;
 
-		if (cstate + 1 > max_cstate) {
+		if (cstate > max_cstate) {
 			printk(PREFIX "max_cstate %d reached\n", max_cstate);
 			break;
 		}
 
-		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint);
-		mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint);
-
 		/* does the state exist in CPUID.MWAIT? */
-		num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4))
-					& MWAIT_SUBSTATE_MASK;
-
-		/* if sub-state in table is not enumerated by CPUID */
-		if ((mwait_substate + 1) > num_substates)
+		num_substates = (mwait_substates >> ((cstate) * 4))
+					& MWAIT_SUBSTATE_MASK;
+		if (num_substates == 0)
 			continue;
+		/* is the state not enabled? */
+		if (cpuidle_state_table[cstate].enter == NULL)
+			continue;
+
+		dev->states_usage[dev->state_count].driver_data =
+			(void *)get_driver_data(cstate);
 
 		dev->state_count += 1;
 	}
diff --git a/trunk/include/linux/cpuidle.h b/trunk/include/linux/cpuidle.h
index 480c14dc1ddd..24cd1037b6d6 100644
--- a/trunk/include/linux/cpuidle.h
+++ b/trunk/include/linux/cpuidle.h
@@ -32,6 +32,8 @@ struct cpuidle_driver;
 ****************************/
 
 struct cpuidle_state_usage {
+	void		*driver_data;
+
 	unsigned long long	disable;
 	unsigned long long	usage;
 	unsigned long long	time; /* in US */
@@ -60,6 +62,26 @@ struct cpuidle_state {
 
 #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
 
+/**
+ * cpuidle_get_statedata - retrieves private driver state data
+ * @st_usage: the state usage statistics
+ */
+static inline void *cpuidle_get_statedata(struct cpuidle_state_usage *st_usage)
+{
+	return st_usage->driver_data;
+}
+
+/**
+ * cpuidle_set_statedata - stores private driver state data
+ * @st_usage: the state usage statistics
+ * @data: the private data
+ */
+static inline void
+cpuidle_set_statedata(struct cpuidle_state_usage *st_usage, void *data)
+{
+	st_usage->driver_data = data;
+}
+
 struct cpuidle_device {
 	unsigned int		registered:1;
 	unsigned int		enabled:1;
diff --git a/trunk/include/linux/pm.h b/trunk/include/linux/pm.h
index 97bcf23e045a..03d7bb145311 100644
--- a/trunk/include/linux/pm.h
+++ b/trunk/include/linux/pm.h
@@ -31,6 +31,7 @@
 /*
 * Callbacks for platform drivers to implement.
 */
+extern void (*pm_idle)(void);
 extern void (*pm_power_off)(void);
 extern void (*pm_power_off_prepare)(void);
diff --git a/trunk/tools/power/x86/turbostat/turbostat.8 b/trunk/tools/power/x86/turbostat/turbostat.8
index b4ddb748356c..0d7dc2cfefb5 100644
--- a/trunk/tools/power/x86/turbostat/turbostat.8
+++ b/trunk/tools/power/x86/turbostat/turbostat.8
@@ -31,6 +31,8 @@ The \fB-S\fP option limits output to a 1-line System Summary for each interval.
 .PP
 The \fB-v\fP option increases verbosity.
 .PP
+The \fB-s\fP option prints the SMI counter, equivalent to "-c 0x34"
+.PP
 The \fB-c MSR#\fP option includes the delta of the specified 32-bit MSR counter.
 .PP
 The \fB-C MSR#\fP option includes the delta of the specified 64-bit MSR counter.
@@ -184,24 +186,26 @@ This is a weighted average, where the weight is %c0.  ie. it is the total number
 un-halted cycles elapsed per time divided by the number of CPUs.
 .SH SMI COUNTING EXAMPLE
 On Intel Nehalem and newer processors, MSR 0x34 is a System Management Mode Interrupt (SMI) counter.
-This counter is shown by default under the "SMI" column.
+Using the -m option, you can display how many SMIs have fired since reset, or if there
+are SMIs during the measurement interval, you can display the delta using the -d option.
 .nf
-[root@x980 ~]# turbostat
-cor CPU    %c0  GHz  TSC SMI    %c1    %c3    %c6 CTMP   %pc3   %pc6
-          0.11 1.91 3.38   0   1.84   0.26  97.79   29   0.82  83.87
-  0   0   0.40 1.63 3.38   0  10.27   0.12  89.20   20   0.82  83.88
-  0   6   0.06 1.63 3.38   0  10.61
-  1   2   0.37 2.63 3.38   0   0.02   0.10  99.51   22
-  1   8   0.01 1.62 3.38   0   0.39
-  2   4   0.07 1.62 3.38   0   0.04   0.07  99.82   23
-  2  10   0.02 1.62 3.38   0   0.09
-  8   1   0.23 1.64 3.38   0   0.10   1.07  98.60   24
-  8   7   0.02 1.64 3.38   0   0.31
-  9   3   0.03 1.62 3.38   0   0.03   0.05  99.89   29
-  9   9   0.02 1.62 3.38   0   0.05
- 10   5   0.07 1.62 3.38   0   0.08   0.12  99.73   27
- 10  11   0.03 1.62 3.38   0   0.13
+[root@x980 ~]# turbostat -m 0x34
+cor CPU    %c0  GHz  TSC   MSR 0x034    %c1    %c3    %c6   %pc3   %pc6
+          1.41 1.82 3.38  0x00000000   8.92  37.82  51.85  17.37   0.55
+  0   0   3.73 2.03 3.38  0x00000055   1.72  48.25  46.31  17.38   0.55
+  0   6   0.14 1.63 3.38  0x00000056   5.30
+  1   2   2.51 1.80 3.38  0x00000056  15.65  29.33  52.52
+  1   8   0.10 1.65 3.38  0x00000056  18.05
+  2   4   1.16 1.68 3.38  0x00000056   5.87  24.47  68.50
+  2  10   0.10 1.63 3.38  0x00000056   6.93
+  8   1   3.84 1.91 3.38  0x00000056   1.36  50.65  44.16
+  8   7   0.08 1.64 3.38  0x00000056   5.12
+  9   3   1.82 1.73 3.38  0x00000056   7.59  24.21  66.38
+  9   9   0.09 1.68 3.38  0x00000056   9.32
+ 10   5   1.66 1.65 3.38  0x00000056  15.10  50.00  33.23
+ 10  11   1.72 1.65 3.38  0x00000056  15.05
 ^C
+[root@x980 ~]#
 .fi
 .SH NOTES
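The man-page hunk documents sampling the SMI count by pointing the generic 32-bit counter option at MSR 0x34. turbostat reads MSRs through the msr driver's per-CPU device nodes, where the file offset selects the register; roughly as below (needs root and the msr module; a sketch of the access pattern, not turbostat's exact code):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int get_msr(int cpu, uint32_t reg, uint64_t *val)
    {
        char path[64];
        int fd, ok;

        snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
        fd = open(path, O_RDONLY);
        if (fd < 0)
            return -1;

        ok = pread(fd, val, sizeof(*val), reg) == sizeof(*val);
        close(fd);
        return ok ? 0 : -1;
    }

    int main(void)
    {
        uint64_t smi;

        if (get_msr(0, 0x34, &smi) == 0)    /* SMI counter on Nehalem+ */
            printf("cpu0 SMI count: %llu\n", (unsigned long long)smi);
        else
            perror("msr read");
        return 0;
    }
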
diff --git a/trunk/tools/power/x86/turbostat/turbostat.c b/trunk/tools/power/x86/turbostat/turbostat.c
index 6f3214ed4444..ce6d46038f74 100644
--- a/trunk/tools/power/x86/turbostat/turbostat.c
+++ b/trunk/tools/power/x86/turbostat/turbostat.c
@@ -58,7 +58,6 @@ unsigned int extra_msr_offset32;
 unsigned int extra_msr_offset64;
 unsigned int extra_delta_offset32;
 unsigned int extra_delta_offset64;
-int do_smi;
 double bclk;
 unsigned int show_pkg;
 unsigned int show_core;
@@ -100,7 +99,6 @@ struct thread_data {
 	unsigned long long extra_delta64;
 	unsigned long long extra_msr32;
 	unsigned long long extra_delta32;
-	unsigned int smi_count;
 	unsigned int cpu_id;
 	unsigned int flags;
 #define CPU_IS_FIRST_THREAD_IN_CORE	0x2
@@ -250,8 +248,6 @@ void print_header(void)
 	if (has_aperf)
 		outp += sprintf(outp, "  GHz");
 	outp += sprintf(outp, "  TSC");
-	if (do_smi)
-		outp += sprintf(outp, " SMI");
 	if (extra_delta_offset32)
 		outp += sprintf(outp, "  count 0x%03X", extra_delta_offset32);
 	if (extra_delta_offset64)
@@ -318,8 +314,6 @@ int dump_counters(struct thread_data *t, struct core_data *c,
 			extra_msr_offset32, t->extra_msr32);
 		fprintf(stderr, "msr0x%x: %016llX\n",
 			extra_msr_offset64, t->extra_msr64);
-		if (do_smi)
-			fprintf(stderr, "SMI: %08X\n", t->smi_count);
 	}
 
 	if (c) {
@@ -358,7 +352,6 @@ int dump_counters(struct thread_data *t, struct core_data *c,
 * RAM_W: %5.2
 * GHz: "GHz" 3 columns %3.2
 * TSC: "TSC" 3 columns %3.2
-* SMI: "SMI" 4 columns %4d
 * percentage " %pc3" %6.2
 * Perf Status percentage: %5.2
 * "CTMP" 4 columns %4d
@@ -438,10 +431,6 @@ int format_counters(struct thread_data *t, struct core_data *c,
 	/* TSC */
 	outp += sprintf(outp, "%5.2f", 1.0 * t->tsc/units/interval_float);
 
-	/* SMI */
-	if (do_smi)
-		outp += sprintf(outp, "%4d", t->smi_count);
-
 	/* delta */
 	if (extra_delta_offset32)
 		outp += sprintf(outp, "  %11llu", t->extra_delta32);
@@ -656,9 +645,6 @@ delta_thread(struct thread_data *new, struct thread_data *old,
 	 */
 	old->extra_msr32 = new->extra_msr32;
 	old->extra_msr64 = new->extra_msr64;
-
-	if (do_smi)
-		old->smi_count = new->smi_count - old->smi_count;
 }
 
 int delta_cpu(struct thread_data *t, struct core_data *c,
@@ -686,7 +672,6 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	t->mperf = 0;
 	t->c1 = 0;
 
-	t->smi_count = 0;
 	t->extra_delta32 = 0;
 	t->extra_delta64 = 0;
 
@@ -817,11 +802,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 			return -4;
 	}
 
-	if (do_smi) {
-		if (get_msr(cpu, MSR_SMI_COUNT, &msr))
-			return -5;
-		t->smi_count = msr & 0xFFFFFFFF;
-	}
 	if (extra_delta_offset32) {
 		if (get_msr(cpu, extra_delta_offset32, &msr))
 			return -5;
@@ -928,7 +908,8 @@ void print_verbose_header(void)
 
 	get_msr(0, MSR_NHM_PLATFORM_INFO, &msr);
 
-	fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr);
+	if (verbose)
+		fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr);
 
 	ratio = (msr >> 40) & 0xFF;
 	fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency\n",
@@ -938,16 +919,13 @@ void print_verbose_header(void)
 	fprintf(stderr, "%d * %.0f = %.0f MHz TSC frequency\n",
 		ratio, bclk, ratio * bclk);
 
-	get_msr(0, MSR_IA32_POWER_CTL, &msr);
-	fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E: %sabled)\n",
"EN" : "DIS"); - if (!do_ivt_turbo_ratio_limit) goto print_nhm_turbo_ratio_limits; get_msr(0, MSR_IVT_TURBO_RATIO_LIMIT, &msr); - fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1038,7 +1016,8 @@ void print_verbose_header(void) get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + if (verbose) + fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1418,9 +1397,6 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) case 0x2D: /* SNB Xeon */ case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ - case 0x3C: /* HSW */ - case 0x3F: /* HSW */ - case 0x45: /* HSW */ return 1; case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ @@ -1512,9 +1488,6 @@ void rapl_probe(unsigned int family, unsigned int model) switch (model) { case 0x2A: case 0x3A: - case 0x3C: /* HSW */ - case 0x3F: /* HSW */ - case 0x45: /* HSW */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; break; case 0x2D: @@ -1751,9 +1724,6 @@ int is_snb(unsigned int family, unsigned int model) case 0x2D: case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ - case 0x3C: /* HSW */ - case 0x3F: /* HSW */ - case 0x45: /* HSW */ return 1; } return 0; @@ -1913,7 +1883,6 @@ void check_cpuid() do_nehalem_platform_info = genuine_intel && has_invariant_tsc; do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ - do_smi = do_nhm_cstates; do_snb_cstates = is_snb(family, model); bclk = discover_bclk(family, model); @@ -2250,6 +2219,9 @@ void cmdline(int argc, char **argv) case 'c': sscanf(optarg, "%x", &extra_delta_offset32); break; + case 's': + extra_delta_offset32 = 0x34; /* SMI counter */ + break; case 'C': sscanf(optarg, "%x", &extra_delta_offset64); break; @@ -2276,7 +2248,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.2 February 11, 2013" + fprintf(stderr, "turbostat v3.0 November 23, 2012" " - Len Brown \n"); turbostat_init();