From 81e6737518a1e392ead4e568a4ee70bb7c371458 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Tue, 23 Jun 2020 14:31:31 +0800
Subject: [PATCH 1/3] PM: s2idle: Clear _TIF_POLLING_NRFLAG before suspend to
 idle

Suspend to idle was found to not work on Goldmont CPU recently.

The issue happens due to:

 1. On Goldmont the CPU in idle can only be woken up via IPIs,
    not POLLING mode, due to commit 08e237fa56a1 ("x86/cpu: Add
    workaround for MONITOR instruction erratum on Goldmont based
    CPUs")

 2. When the CPU is entering suspend to idle process, the
    _TIF_POLLING_NRFLAG remains on, because cpuidle_enter_s2idle()
    doesn't match call_cpuidle() exactly.

 3. Commit b2a02fc43a1f ("smp: Optimize send_call_function_single_ipi()")
    makes use of _TIF_POLLING_NRFLAG to avoid sending IPIs to idle
    CPUs.

 4. As a result, some IPIs related functions might not work
    well during suspend to idle on Goldmont. For example, one
    suspected victim:

    tick_unfreeze() -> timekeeping_resume() -> hrtimers_resume()
    -> clock_was_set() -> on_each_cpu() might wait forever,
    because the IPIs will not be sent to the CPUs which are
    sleeping with _TIF_POLLING_NRFLAG set, and Goldmont CPU
    could not be woken up by only setting _TIF_NEED_RESCHED
    on the monitor address.

To avoid that, clear the _TIF_POLLING_NRFLAG flag before invoking
enter_s2idle_proper() in cpuidle_enter_s2idle() in analogy with the
call_cpuidle() code flow.

Fixes: b2a02fc43a1f ("smp: Optimize send_call_function_single_ipi()")
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: Rafael J. Wysocki <rafael@kernel.org>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
[ rjw: Subject / changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c149d9e20dfde..e092789187c63 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -13,6 +13,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/idle.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
@@ -186,7 +187,7 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	 * be frozen safely.
 	 */
 	index = find_deepest_state(drv, dev, U64_MAX, 0, true);
-	if (index > 0)
+	if (index > 0 && !current_clr_polling_and_test())
 		enter_s2idle_proper(drv, dev, index);
 
 	return index;

From 589bab6bb30c8cd43a7da6dcf43cc3a586cc2e25 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 12 Jun 2020 11:09:57 -0700
Subject: [PATCH 2/3] cpufreq: intel_pstate: Add one more OOB control bit

Add one more bit for OOB (Out Of Band) enabling of P-states.

If OOB handling of P-states is enabled, intel_pstate shouldn't load.
Currently, only "BIT(8) == 1" of the MSR MSR_MISC_PWR_MGMT is
considered as OOB, but "BIT(18) == 1" needs to be taken into
consideration as OOB condition too.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Add an empty code line, edit subject and changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8e23a698ce048..e771e8b4f99f0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2677,6 +2677,8 @@ static struct acpi_platform_list plat_info[] __initdata = {
 	{ } /* End */
 };
 
+#define BITMASK_OOB	(BIT(8) | BIT(18))
+
 static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
 {
 	const struct x86_cpu_id *id;
@@ -2686,8 +2688,9 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
 	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
 	if (id) {
 		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
-		if (misc_pwr & (1 << 8)) {
-			pr_debug("Bit 8 in the MISC_PWR_MGMT MSR set\n");
+		if (misc_pwr & BITMASK_OOB) {
+			pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n");
+			pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n");
 			return true;
 		}
 	}

From 10e8b11eb3195e11450c509d4dd3984d707a4167 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 25 Jun 2020 13:52:53 +0200
Subject: [PATCH 3/3] cpuidle: Rearrange s2idle-specific idle state entry code

Implement call_cpuidle_s2idle() in analogy with call_cpuidle()
for the s2idle-specific idle state entry and invoke it from
cpuidle_idle_call() to make the s2idle-specific idle entry code
path look more similar to the "regular" idle entry one.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Chen Yu <yu.c.chen@intel.com>
---
 drivers/cpuidle/cpuidle.c |  6 +++---
 kernel/sched/idle.c       | 15 +++++++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index e092789187c63..87197319ab069 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -13,7 +13,6 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
-#include <linux/sched/idle.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
@@ -187,9 +186,10 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	 * be frozen safely.
 	 */
 	index = find_deepest_state(drv, dev, U64_MAX, 0, true);
-	if (index > 0 && !current_clr_polling_and_test())
+	if (index > 0) {
 		enter_s2idle_proper(drv, dev, index);
-
+		local_irq_enable();
+	}
 	return index;
 }
 #endif /* CONFIG_SUSPEND */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 05deb81bb3e3d..1ae95b9150d3c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void)
 	}
 }
 
+static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+			       struct cpuidle_device *dev)
+{
+	if (current_clr_polling_and_test())
+		return -EBUSY;
+
+	return cpuidle_enter_s2idle(drv, dev);
+}
+
 static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		      int next_state)
 {
@@ -171,11 +180,9 @@ static void cpuidle_idle_call(void)
 		if (idle_should_enter_s2idle()) {
 			rcu_idle_enter();
 
-			entered_state = cpuidle_enter_s2idle(drv, dev);
-			if (entered_state > 0) {
-				local_irq_enable();
+			entered_state = call_cpuidle_s2idle(drv, dev);
+			if (entered_state > 0)
 				goto exit_idle;
-			}
 
 			rcu_idle_exit();