diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 40d156a31676e..00a831d56c500 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -721,6 +721,7 @@ void arch_cpu_idle(void) { x86_idle(); } +EXPORT_SYMBOL_GPL(arch_cpu_idle); /* * We use this if we don't have any better idle routine.. diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 50e726b6c2cfa..b29be7d4d7d0e 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1864,6 +1864,10 @@ static bool pm_runtime_need_not_resume(struct device *dev) * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. + * + * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent + * state where this function has called the ->runtime_suspend callback but the + * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index ff71dd662880d..cac5997dca505 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -74,6 +74,7 @@ endmenu config HALTPOLL_CPUIDLE tristate "Halt poll cpuidle driver" depends on X86 && KVM_GUEST + select CPU_IDLE_GOV_HALTPOLL default y help This option enables halt poll cpuidle driver, which allows to poll diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index 747aa537389b9..a1ee475d180da 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE It provides an idle driver that is capable of detecting and managing idle states through the PSCI firmware interface. + The driver has limitations when used with PREEMPT_RT: + - If the idle states are described with the non-hierarchical layout, + all idle states are still available. + + - If the idle states are described with the hierarchical layout, + only the idle states defined per CPU are available, but not the ones + being shared among a group of CPUs (aka cluster idle states). + config ARM_PSCI_CPUIDLE_DOMAIN bool "PSCI CPU idle Domain" depends on ARM_PSCI_CPUIDLE @@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE config ARM_TEGRA_CPUIDLE bool "CPU Idle Driver for NVIDIA Tegra SoCs" depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP select ARM_CPU_SUSPEND help @@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE config ARM_QCOM_SPM_CPUIDLE bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)" depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE select ARM_CPU_SUSPEND select CPU_IDLE_MULTIPLE_DRIVERS select DT_IDLE_STATES diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 3a39a7f48b771..e66df22f96955 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev, local_irq_enable(); return index; } - default_idle(); + arch_cpu_idle(); return index; } diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index c80cf9ddabd8a..6ad2954948a5a 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi) pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; - /* Allow power off when OSI has been successfully enabled. */ - if (use_osi) + /* + * Allow power off when OSI has been successfully enabled. + * PREEMPT_RT is not yet ready to enter domain idle states. + */ + if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT)) pd->power_off = psci_pd_power_off; else pd->flags |= GENPD_FLAG_ALWAYS_ON; diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index 57bc3e3ae3912..68467a325dcb1 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -231,6 +231,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv, if (!psci_has_osi_support()) return 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return 0; + data->dev = psci_dt_attach_cpu(cpu); if (IS_ERR_OR_NULL(data->dev)) return PTR_ERR_OR_ZERO(data->dev); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index f70aa17e2a8e0..d9cda7f6ccb98 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->target_residency_ns = s->target_residency * NSEC_PER_USEC; else if (s->target_residency_ns < 0) s->target_residency_ns = 0; + else + s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC); if (s->exit_latency > 0) s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; else if (s->exit_latency_ns < 0) s->exit_latency_ns = 0; + else + s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); } } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index d9262db79cae5..987fc5f3997dc 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -2,8 +2,13 @@ /* * Timer events oriented CPU idle governor * + * TEO governor: * Copyright (C) 2018 - 2021 Intel Corporation * Author: Rafael J. Wysocki + * + * Util-awareness mechanism: + * Copyright (C) 2022 Arm Ltd. + * Author: Kajetan Puchalski */ /** @@ -99,14 +104,55 @@ * select the given idle state instead of the candidate one. * * 3. By default, select the candidate state. + * + * Util-awareness mechanism: + * + * The idea behind the util-awareness extension is that there are two distinct + * scenarios for the CPU which should result in two different approaches to idle + * state selection - utilized and not utilized. + * + * In this case, 'utilized' means that the average runqueue util of the CPU is + * above a certain threshold. + * + * When the CPU is utilized while going into idle, more likely than not it will + * be woken up to do more work soon and so a shallower idle state should be + * selected to minimise latency and maximise performance. When the CPU is not + * being utilized, the usual metrics-based approach to selecting the deepest + * available idle state should be preferred to take advantage of the power + * saving. + * + * In order to achieve this, the governor uses a utilization threshold. + * The threshold is computed per-CPU as a percentage of the CPU's capacity + * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%) + * seems to be getting the best results. + * + * Before selecting the next idle state, the governor compares the current CPU + * util to the precomputed util threshold. If it's below, it defaults to the + * TEO metrics mechanism. If it's above, the closest shallower idle state will + * be selected instead, as long as is not a polling state. */ #include #include #include +#include #include +#include #include +/* + * The number of bits to shift the CPU's capacity by in order to determine + * the utilized threshold. + * + * 6 was chosen based on testing as the number that achieved the best balance + * of power and performance on average. + * + * The resulting threshold is high enough to not be triggered by background + * noise and low enough to react quickly when activity starts to ramp up. + */ +#define UTIL_THRESHOLD_SHIFT 6 + + /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. @@ -137,9 +183,11 @@ struct teo_bin { * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. - * @total: Grand total of the "intercepts" and "hits" mertics for all bins. + * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @next_recent_idx: Index of the next @recent_idx entry to update. * @recent_idx: Indices of bins corresponding to recent "intercepts". + * @util_threshold: Threshold above which the CPU is considered utilized + * @utilized: Whether the last sleep on the CPU happened while utilized */ struct teo_cpu { s64 time_span_ns; @@ -148,10 +196,29 @@ struct teo_cpu { unsigned int total; int next_recent_idx; int recent_idx[NR_RECENT]; + unsigned long util_threshold; + bool utilized; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); +/** + * teo_cpu_is_utilized - Check if the CPU's util is above the threshold + * @cpu: Target CPU + * @cpu_data: Governor CPU data for the target CPU + */ +#ifdef CONFIG_SMP +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return sched_cpu_util(cpu) > cpu_data->util_threshold; +} +#else +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return false; +} +#endif + /** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. @@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) * @dev: Target CPU. * @state_idx: Index of the capping idle state. * @duration_ns: Idle duration value to match. + * @no_poll: Don't consider polling states. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - s64 duration_ns) + s64 duration_ns, bool no_poll) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (dev->states_usage[i].disable) + if (dev->states_usage[i].disable || + (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) continue; state_idx = i; @@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } + cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); + /* + * If the CPU is being utilized over the threshold and there are only 2 + * states to choose from, the metrics need not be considered, so choose + * the shallowest non-polling state and exit. + */ + if (drv->state_count < 3 && cpu_data->utilized) { + for (i = 0; i < drv->state_count; ++i) { + if (!dev->states_usage[i].disable && + !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) { + idx = i; + goto end; + } + } + } + /* * Find the deepest idle state whose target residency does not exceed * the current sleep length and the deepest idle state not deeper than @@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx > constraint_idx) idx = constraint_idx; + /* + * If the CPU is being utilized over the threshold, choose a shallower + * non-polling state to improve latency + */ + if (cpu_data->utilized) + idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true); + end: /* * Don't stop the tick if the selected state is a polling one or if the @@ -469,7 +561,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (idx > idx0 && drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick); + idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); } return idx; @@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu); int i; memset(cpu_data, 0, sizeof(*cpu_data)); + cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT; for (i = 0; i < NR_RECENT; i++) cpu_data->recent_idx[i] = -1; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 2b496a53cbca1..48948b1717494 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj) complete(&kdev->kobj_unregister); } -static struct kobj_type ktype_cpuidle = { +static const struct kobj_type ktype_cpuidle = { .sysfs_ops = &cpuidle_sysfs_ops, .release = cpuidle_sysfs_release, }; @@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj) complete(&state_obj->kobj_unregister); } -static struct kobj_type ktype_state_cpuidle = { +static const struct kobj_type ktype_state_cpuidle = { .sysfs_ops = &cpuidle_state_sysfs_ops, .default_groups = cpuidle_state_default_groups, .release = cpuidle_state_sysfs_release, @@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = { }; ATTRIBUTE_GROUPS(cpuidle_driver_default); -static struct kobj_type ktype_driver_cpuidle = { +static const struct kobj_type ktype_driver_cpuidle = { .sysfs_ops = &cpuidle_driver_sysfs_ops, .default_groups = cpuidle_driver_default_groups, .release = cpuidle_driver_sysfs_release, diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cfeb24d40d378..bb3d10099ba44 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1430,6 +1430,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &idle_cpu_adl_n), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), @@ -1862,6 +1863,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) skx_idle_state_table_update(); break; case INTEL_FAM6_SAPPHIRERAPIDS_X: + case INTEL_FAM6_EMERALDRAPIDS_X: spr_idle_state_table_update(); break; case INTEL_FAM6_ALDERLAKE: diff --git a/include/linux/pm.h b/include/linux/pm.h index 93cd34f00822c..035d9649eba48 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -379,9 +379,13 @@ const struct dev_pm_ops name = { \ const struct dev_pm_ops name; \ __EXPORT_SYMBOL(name, sec, ns); \ const struct dev_pm_ops name +#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) +#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, ns) #else #define _EXPORT_DEV_PM_OPS(name, sec, ns) \ static __maybe_unused const struct dev_pm_ops __static_##name +#define EXPORT_PM_FN_GPL(name) +#define EXPORT_PM_FN_NS_GPL(name, ns) #endif #define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "") diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 60a1d3051cc79..4b31629c5be4b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -118,7 +118,6 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM - select SRCU config PM_SLEEP_SMP def_bool y diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 277434b6c0bfd..36a1df48280c3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle, return ret; } -/** +/* * Structure used for CRC32. */ struct crc_data { @@ -596,7 +596,7 @@ struct crc_data { unsigned char *unc[LZO_THREADS]; /* uncompressed data */ }; -/** +/* * CRC32 update function that runs in its own thread. */ static int crc32_threadfn(void *data) @@ -623,7 +623,7 @@ static int crc32_threadfn(void *data) } return 0; } -/** +/* * Structure used for LZO data compression. */ struct cmp_data { @@ -640,7 +640,7 @@ struct cmp_data { unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ }; -/** +/* * Compression function that runs in its own thread. */ static int lzo_compress_threadfn(void *data) @@ -948,9 +948,9 @@ int swsusp_write(unsigned int flags) return error; } -/** +/* * The following functions allow us to read data using a swap map - * in a file-alike way + * in a file-like way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle, return ret; } -/** +/* * Structure used for LZO data decompression. */ struct dec_data { @@ -1123,7 +1123,7 @@ struct dec_data { unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ }; -/** +/* * Decompression function that runs in its own thread. */ static int lzo_decompress_threadfn(void *data)