From d90167a941f62860f35eb960e1012aa2d30e7e94 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Thu, 10 Dec 2015 11:12:26 +0100
Subject: [PATCH 1/6] x86/mce: Ensure offline CPUs don't participate in
 rendezvous process

Intel's MCA implementation broadcasts MCEs to all CPUs on the
node. This poses a problem for offlined CPUs which cannot
participate in the rendezvous process:

  Kernel panic - not syncing: Timeout: Not all CPUs entered broadcast exception handler
  Kernel Offset: disabled
  Rebooting in 100 seconds..

More specifically, Linux does a soft offline of a CPU when
writing a 0 to /sys/devices/system/cpu/cpuX/online, which
doesn't prevent the #MC exception from being broadcasted to that
CPU.

Ensure that offline CPUs don't participate in the MCE rendezvous
and clear the RIP valid status bit so that a second MCE won't
cause a shutdown.

Without the patch, mce_start() will increment mce_callin and
wait for all CPUs. Offlined CPUs should avoid participating in
the rendezvous process altogether.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
[ Massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1449742346-21470-2-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c5b0d562dbf55..7e8a736d09db1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -999,6 +999,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
+	/* If this CPU is offline, just bail out. */
+	if (cpu_is_offline(smp_processor_id())) {
+		u64 mcgstatus;
+
+		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+		if (mcgstatus & MCG_STATUS_RIPV) {
+			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+			return;
+		}
+	}
+
 	ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);

From 91e2eea98f94a2ebb143d4c4cdeaa4573d62dc17 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Thu, 19 Nov 2015 16:55:45 -0500
Subject: [PATCH 2/6] x86/xen: Avoid fast syscall path for Xen PV guests

After 32-bit syscall rewrite, and specifically after commit:

  5f310f739b4c ("x86/entry/32: Re-implement SYSENTER using the new C path")

... the stack frame that is passed to xen_sysexit is no longer a
"standard" one (i.e. it's not pt_regs).

Since we end up calling xen_iret from xen_sysexit we don't need
to fix up the stack and instead follow entry_SYSENTER_32's IRET
path directly to xen_iret.

We can do the same thing for compat mode even though stack does
not need to be fixed. This will allow us to drop usergs_sysret32
paravirt op (in the subsequent patch)

Suggested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: david.vrabel@citrix.com
Cc: konrad.wilk@oracle.com
Cc: virtualization@lists.linux-foundation.org
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1447970147-1733-2-git-send-email-boris.ostrovsky@oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/entry_32.S         |  5 +++--
 arch/x86/entry/entry_64_compat.S  | 10 ++++++----
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/xen/enlighten.c          |  4 +++-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3eb572ed3d7ad..0870825a95685 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -308,8 +308,9 @@ sysenter_past_esp:
 
 	movl	%esp, %eax
 	call	do_fast_syscall_32
-	testl	%eax, %eax
-	jz	.Lsyscall_32_done
+	/* XEN PV guests always use IRET path */
+	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
 
 /* Opportunistic SYSEXIT */
 	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index c3201830a85ee..402e34a21559e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -121,8 +121,9 @@ sysenter_flags_fixed:
 
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
-	testl	%eax, %eax
-	jz	.Lsyscall_32_done
+	/* XEN PV guests always use IRET path */
+	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
 	jmp	sysret32_from_system_call
 
 sysenter_fix_flags:
@@ -200,8 +201,9 @@ ENTRY(entry_SYSCALL_compat)
 
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
-	testl	%eax, %eax
-	jz	.Lsyscall_32_done
+	/* XEN PV guests always use IRET path */
+	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
 
 	/* Opportunistic SYSRET */
 sysret32_from_system_call:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e4f8010f22e04..f7ba9fbf12eeb 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -216,6 +216,7 @@
 #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
 #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
 #define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
 
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5774800ff583c..d315151411e56 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1886,8 +1886,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
 
 static void xen_set_cpu_features(struct cpuinfo_x86 *c)
 {
-	if (xen_pv_domain())
+	if (xen_pv_domain()) {
 		clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+		set_cpu_cap(c, X86_FEATURE_XENPV);
+	}
 }
 
 const struct hypervisor_x86 x86_hyper_xen = {

From d8c98a1d1488747625ad6044d423406e17e99b7a Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 11 Dec 2015 09:07:53 -0500
Subject: [PATCH 3/6] x86/paravirt: Prevent rtc_cmos platform device init on PV
 guests

Adding the rtc platform device in non-privileged Xen PV guests causes
an IRQ conflict because these guests do not have legacy PIC and may
allocate irqs in the legacy range.

In a single VCPU Xen PV guest we should have:

/proc/interrupts:
           CPU0
  0:       4934  xen-percpu-virq      timer0
  1:          0  xen-percpu-ipi       spinlock0
  2:          0  xen-percpu-ipi       resched0
  3:          0  xen-percpu-ipi       callfunc0
  4:          0  xen-percpu-virq      debug0
  5:          0  xen-percpu-ipi       callfuncsingle0
  6:          0  xen-percpu-ipi       irqwork0
  7:        321   xen-dyn-event     xenbus
  8:         90   xen-dyn-event     hvc_console
  ...

But hvc_console cannot get its interrupt because it is already in use
by rtc0 and the console does not work.

  genirq: Flags mismatch irq 8. 00000000 (hvc_console) vs. 00000000 (rtc0)

We can avoid this problem by realizing that unprivileged PV guests (both
Xen and lguests) are not supposed to have rtc_cmos device and so
adding it is not necessary.

Privileged guests (i.e. Xen's dom0) do use it but they should not have
irq conflicts since they allocate irqs above legacy range (above
gsi_top, in fact).

Instead of explicitly testing whether the guest is privileged we can
extend pv_info structure to include information about guest's RTC
support.

Reported-and-tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: vkuznets@redhat.com
Cc: xen-devel@lists.xenproject.org
Cc: konrad.wilk@oracle.com
Cc: stable@vger.kernel.org # 4.2+
Link: http://lkml.kernel.org/r/1449842873-2613-1-git-send-email-boris.ostrovsky@oracle.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/paravirt.h       | 6 ++++++
 arch/x86/include/asm/paravirt_types.h | 5 +++++
 arch/x86/include/asm/processor.h      | 1 +
 arch/x86/kernel/rtc.c                 | 3 +++
 arch/x86/lguest/boot.c                | 1 +
 arch/x86/xen/enlighten.c              | 4 +++-
 6 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596433f89..c759b3cca6634 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,6 +19,12 @@ static inline int paravirt_enabled(void)
 	return pv_info.paravirt_enabled;
 }
 
+static inline int paravirt_has_feature(unsigned int feature)
+{
+	WARN_ON_ONCE(!pv_info.paravirt_enabled);
+	return (pv_info.features & feature);
+}
+
 static inline void load_sp0(struct tss_struct *tss,
 			     struct thread_struct *thread)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5bff7c8..3d44191185f8c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -70,9 +70,14 @@ struct pv_info {
 #endif
 
 	int paravirt_enabled;
+	unsigned int features;	  /* valid only if paravirt_enabled is set */
 	const char *name;
 };
 
+#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
+/* Supported features */
+#define PV_SUPPORTED_RTC        (1<<0)
+
 struct pv_init_ops {
 	/*
 	 * Patch may replace one of the defined code sequences with
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 67522256c7ffa..2d5a50cb61a2d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void)
 #else
 #define __cpuid			native_cpuid
 #define paravirt_enabled()	0
+#define paravirt_has(x) 	0
 
 static inline void load_sp0(struct tss_struct *tss,
 			    struct thread_struct *thread)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index cd9685235df91..4af8d063fb362 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void)
 	}
 #endif
 
+	if (paravirt_enabled() && !paravirt_has(RTC))
+		return -ENODEV;
+
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a0d09f6c65337..a43b2eafc466f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1414,6 +1414,7 @@ __init void lguest_init(void)
 	pv_info.kernel_rpl = 1;
 	/* Everyone except Xen runs with this set. */
 	pv_info.shared_kernel_pmd = 1;
+	pv_info.features = 0;
 
 	/*
 	 * We set up all the lguest overrides for sensitive operations.  These
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d315151411e56..b7de78bdc09c1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1192,7 +1192,7 @@ static const struct pv_info xen_info __initconst = {
 #ifdef CONFIG_X86_64
 	.extra_user_64bit_cs = FLAT_USER_CS64,
 #endif
-
+	.features = 0,
 	.name = "Xen",
 };
 
@@ -1535,6 +1535,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
+	if (xen_initial_domain())
+		pv_info.features |= PV_SUPPORTED_RTC;
 	pv_init_ops = xen_init_ops;
 	pv_apic_ops = xen_apic_ops;
 	if (!xen_pvh_domain()) {

From 6a613ac6bc015f8ef75806d397d69dbac4a8d8c4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 16 Dec 2015 23:18:47 -0800
Subject: [PATCH 4/6] x86/entry: Fix some comments

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-and-tested-by: Borislav Petkov <bp@alien8.de>
Cc: <mark.gross@intel.com>
Cc: Su Tao <tao.su@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: <qiuxu.zhuo@intel.com>
Cc: <frank.wang@intel.com>
Cc: <borun.fu@intel.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Mingwei Shi <mingwei.shi@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/entry_32.S                | 2 +-
 arch/x86/entry/vdso/vdso32/system_call.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 0870825a95685..fcad8ac30a8e7 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32)
 	movl	TSS_sysenter_sp0(%esp), %esp
 sysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
-	pushl	%ecx			/* pt_regs->cx */
+	pushl	%ecx			/* pt_regs->sp (stashed in cx) */
 	pushfl				/* pt_regs->flags (except IF = 0) */
 	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
 	pushl	$__USER_CS		/* pt_regs->cs */
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 93bd8452383f8..8f42b1b9e8dfa 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -1,5 +1,5 @@
 /*
- * Code for the vDSO.  This version uses the old int $0x80 method.
+ * AT_SYSINFO entry point
 */
 
 #include <asm/dwarf2.h>

From 30bfa7b3488bfb1bb75c9f50a5fcac1832970c60 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 16 Dec 2015 23:18:48 -0800
Subject: [PATCH 5/6] x86/entry: Restore traditional SYSENTER calling
 convention

It turns out that some Android versions hardcode the SYSENTER
calling convention.  This is buggy and will cause problems no
matter what the kernel does.  Nonetheless, we should try to
support it.

Credit goes to Linus for pointing out a clean way to handle
the SYSENTER/SYSCALL clobber differences while preserving
straightforward DWARF annotations.

I believe that the original offending Android commit was:

https://android.googlesource.com/platform%2Fbionic/+/7dc3684d7a2587e43e6d2a8e0e3f39bf759bd535

Reported-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-and-tested-by: Borislav Petkov <bp@alien8.de>
Cc: <mark.gross@intel.com>
Cc: Su Tao <tao.su@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: <frank.wang@intel.com>
Cc: <borun.fu@intel.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Mingwei Shi <mingwei.shi@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/common.c                  |  6 +--
 arch/x86/entry/entry_32.S                |  2 +-
 arch/x86/entry/entry_64_compat.S         | 10 ++---
 arch/x86/entry/vdso/vdso32/system_call.S | 52 +++++++++++++++++++-----
 4 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0beb..03663740c8665 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
 	regs->ip = landing_pad;
 
 	/*
-	 * Fetch ECX from where the vDSO stashed it.
+	 * Fetch EBP from where the vDSO stashed it.
 	 *
 	 * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
 	 */
@@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
 		 * Micro-optimization: the pointer we're following is explicitly
 		 * 32 bits, so it can't be out of range.
 		 */
-		__get_user(*(u32 *)&regs->cx,
+		__get_user(*(u32 *)&regs->bp,
 			    (u32 __user __force *)(unsigned long)(u32)regs->sp)
 #else
-		get_user(*(u32 *)&regs->cx,
+		get_user(*(u32 *)&regs->bp,
 			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
 #endif
 		) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index fcad8ac30a8e7..f3b6d54e0042b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32)
 	movl	TSS_sysenter_sp0(%esp), %esp
 sysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
-	pushl	%ecx			/* pt_regs->sp (stashed in cx) */
+	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
 	pushfl				/* pt_regs->flags (except IF = 0) */
 	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
 	pushl	$__USER_CS		/* pt_regs->cs */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 402e34a21559e..6a1ae3751e824 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -63,7 +63,7 @@ ENTRY(entry_SYSENTER_compat)
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER32_DS		/* pt_regs->ss */
-	pushq	%rcx			/* pt_regs->sp */
+	pushq	%rbp			/* pt_regs->sp (stashed in bp) */
 
 	/*
 	 * Push flags.  This is nasty.  First, interrupts are currently
@@ -82,14 +82,14 @@ ENTRY(entry_SYSENTER_compat)
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
-	pushq	%rcx			/* pt_regs->cx (will be overwritten) */
+	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   %r8                     /* pt_regs->r8  = 0 */
 	pushq   %r8                     /* pt_regs->r9  = 0 */
 	pushq   %r8                     /* pt_regs->r10 = 0 */
 	pushq   %r8                     /* pt_regs->r11 = 0 */
 	pushq   %rbx                    /* pt_regs->rbx */
-	pushq   %rbp                    /* pt_regs->rbp */
+	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
 	pushq   %r8                     /* pt_regs->r12 = 0 */
 	pushq   %r8                     /* pt_regs->r13 = 0 */
 	pushq   %r8                     /* pt_regs->r14 = 0 */
@@ -179,7 +179,7 @@ ENTRY(entry_SYSCALL_compat)
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
-	pushq	%rcx			/* pt_regs->cx (will be overwritten) */
+	pushq	%rbp			/* pt_regs->cx (stashed in bp) */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	xorq    %r8,%r8
 	pushq   %r8                     /* pt_regs->r8  = 0 */
@@ -187,7 +187,7 @@ ENTRY(entry_SYSCALL_compat)
 	pushq   %r8                     /* pt_regs->r10 = 0 */
 	pushq   %r8                     /* pt_regs->r11 = 0 */
 	pushq   %rbx                    /* pt_regs->rbx */
-	pushq   %rbp                    /* pt_regs->rbp */
+	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
 	pushq   %r8                     /* pt_regs->r12 = 0 */
 	pushq   %r8                     /* pt_regs->r13 = 0 */
 	pushq   %r8                     /* pt_regs->r14 = 0 */
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 8f42b1b9e8dfa..3a1d9297074bc 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -21,35 +21,67 @@ __kernel_vsyscall:
 	/*
 	 * Reshuffle regs so that all of any of the entry instructions
 	 * will preserve enough state.
+	 *
+	 * A really nice entry sequence would be:
+	 *  pushl %edx
+	 *  pushl %ecx
+	 *  movl  %esp, %ecx
+	 *
+	 * Unfortunately, naughty Android versions between July and December
+	 * 2015 actually hardcode the traditional Linux SYSENTER entry
+	 * sequence.  That is severely broken for a number of reasons (ask
+	 * anyone with an AMD CPU, for example).  Nonetheless, we try to keep
+	 * it working approximately as well as it ever worked.
+	 *
+	 * This link may eludicate some of the history:
+	 *   https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+	 * personally, I find it hard to understand what's going on there.
+	 *
+	 * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+	 * Execute an indirect call to the address in the AT_SYSINFO auxv
+	 * entry.  That is the ONLY correct way to make a fast 32-bit system
+	 * call on Linux.  (Open-coding int $0x80 is also fine, but it's
+	 * slow.)
 	 */
+	pushl	%ecx
+	CFI_ADJUST_CFA_OFFSET	4
+	CFI_REL_OFFSET		ecx, 0
 	pushl	%edx
 	CFI_ADJUST_CFA_OFFSET	4
 	CFI_REL_OFFSET		edx, 0
-	pushl	%ecx
+	pushl	%ebp
 	CFI_ADJUST_CFA_OFFSET	4
-	CFI_REL_OFFSET		ecx, 0
-	movl	%esp, %ecx
+	CFI_REL_OFFSET		ebp, 0
+
+	#define SYSENTER_SEQUENCE	"movl %esp, %ebp; sysenter"
+	#define SYSCALL_SEQUENCE	"movl %ecx, %ebp; syscall"
 
 #ifdef CONFIG_X86_64
 	/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
-	ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
-	                  "syscall",  X86_FEATURE_SYSCALL32
+	ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+	                  SYSCALL_SEQUENCE,  X86_FEATURE_SYSCALL32
 #else
-	ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+	ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
 #endif
 
 	/* Enter using int $0x80 */
-	movl	(%esp), %ecx
 	int	$0x80
 GLOBAL(int80_landing_pad)
 
-	/* Restore ECX and EDX in case they were clobbered. */
-	popl	%ecx
-	CFI_RESTORE		ecx
+	/*
+	 * Restore EDX and ECX in case they were clobbered.  EBP is not
+	 * clobbered (the kernel restores it), but it's cleaner and
+	 * probably faster to pop it than to adjust ESP using addl.
+	 */
+	popl	%ebp
+	CFI_RESTORE		ebp
 	CFI_ADJUST_CFA_OFFSET	-4
 	popl	%edx
 	CFI_RESTORE		edx
 	CFI_ADJUST_CFA_OFFSET	-4
+	popl	%ecx
+	CFI_RESTORE		ecx
+	CFI_ADJUST_CFA_OFFSET	-4
 	ret
 	CFI_ENDPROC
 

From dd7a5ab495019d424c2b0747892eb2e38a052ba5 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@numascale.com>
Date: Thu, 31 Dec 2015 02:06:47 +0800
Subject: [PATCH 6/6] x86/numachip: Fix NumaConnect2 MMCFG PCI access

The MMCFG PCI accessors weren't being setup for NumacConnect2
correctly due to over-early assignment; this would create the
potential for the wrong PCI domain to be accessed.

Fix this by using the correct arch-specific PCI init function.

Signed-off-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Steffen Persvold <sp@numascale.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1451498807-15920-1-git-send-email-daniel@numascale.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/apic_numachip.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 38dd5efdd04c3..2bd2292a316d4 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -193,20 +193,17 @@ static int __init numachip_system_init(void)
 	case 1:
 		init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
 		numachip_apic_icr_write = numachip1_apic_icr_write;
-		x86_init.pci.arch_init = pci_numachip_init;
 		break;
 	case 2:
 		init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
 		numachip_apic_icr_write = numachip2_apic_icr_write;
-
-		/* Use MCFG config cycles rather than locked CF8 cycles */
-		raw_pci_ops = &pci_mmcfg;
 		break;
 	default:
 		return 0;
 	}
 
 	x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+	x86_init.pci.arch_init = pci_numachip_init;
 
 	return 0;
 }